From 363ab6f1424cdea63e5d182312d60e19077b892a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: core: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c722..50ae922c6022 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -390,7 +390,7 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); - for_each_cpu_mask(cpu, frozen_cpus) { + for_each_cpu_mask_nr(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { printk("CPU%d is up\n", cpu); -- cgit v1.2.3 From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 04:43:49 -0700 Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2) This is based on Linus' idea of creating cpu_active_map that prevents scheduler load balancer from migrating tasks to the cpu that is going down. It allows us to simplify domain management code and avoid unecessary domain rebuilds during cpu hotplug event handling. Please ignore the cpusets part for now. It needs some more work in order to avoid crazy lock nesting. Although I did simplfy and unify domain reinitialization logic. We now simply call partition_sched_domains() in all the cases. This means that we're using exact same code paths as in cpusets case and hence the test below cover cpusets too. Cpuset changes to make rebuild_sched_domains() callable from various contexts are in the separate patch (right next after this one). This not only boots but also easily handles while true; do make clean; make -j 8; done and while true; do on-off-cpu 1; done at the same time. (on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing). Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing this on right now in gnome-terminal and things are moving just fine. Also this is running with most of the debug features enabled (lockdep, mutex, etc) no BUG_ONs or lockdep complaints so far. I believe I addressed all of the Dmitry's comments for original Linus' version. I changed both fair and rt balancer to mask out non-active cpus. And replaced cpu_is_offline() with !cpu_active() in the main scheduler code where it made sense (to me). Signed-off-by: Max Krasnyanskiy Acked-by: Linus Torvalds Acked-by: Peter Zijlstra Acked-by: Gregory Haskins Cc: dmitry.adamushko@gmail.com Cc: pj@sgi.com Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 6 ++- include/linux/cpuset.h | 7 ++++ init/main.c | 7 ++++ kernel/cpu.c | 30 +++++++++++--- kernel/cpuset.c | 2 +- kernel/sched.c | 108 ++++++++++++++++++++---------------------------- kernel/sched_fair.c | 3 ++ kernel/sched_rt.c | 7 ++++ 8 files changed, 99 insertions(+), 71 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c24875bd9c5b..d614d2472798 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -359,13 +359,14 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp, /* * The following particular system cpumasks and operations manage - * possible, present and online cpus. Each of them is a fixed size + * possible, present, active and online cpus. Each of them is a fixed size * bitmap of size NR_CPUS. * * #ifdef CONFIG_HOTPLUG_CPU * cpu_possible_map - has bit 'cpu' set iff cpu is populatable * cpu_present_map - has bit 'cpu' set iff cpu is populated * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler + * cpu_active_map - has bit 'cpu' set iff cpu available to migration * #else * cpu_possible_map - has bit 'cpu' set iff cpu is populated * cpu_present_map - copy of cpu_possible_map @@ -416,6 +417,7 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp, extern cpumask_t cpu_possible_map; extern cpumask_t cpu_online_map; extern cpumask_t cpu_present_map; +extern cpumask_t cpu_active_map; #if NR_CPUS > 1 #define num_online_cpus() cpus_weight(cpu_online_map) @@ -424,6 +426,7 @@ extern cpumask_t cpu_present_map; #define cpu_online(cpu) cpu_isset((cpu), cpu_online_map) #define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map) #define cpu_present(cpu) cpu_isset((cpu), cpu_present_map) +#define cpu_active(cpu) cpu_isset((cpu), cpu_active_map) #else #define num_online_cpus() 1 #define num_possible_cpus() 1 @@ -431,6 +434,7 @@ extern cpumask_t cpu_present_map; #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) +#define cpu_active(cpu) ((cpu) == 0) #endif #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 038578362b47..e8f450c499b0 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -78,6 +78,8 @@ extern void cpuset_track_online_nodes(void); extern int current_cpuset_is_being_rebound(void); +extern void rebuild_sched_domains(void); + #else /* !CONFIG_CPUSETS */ static inline int cpuset_init_early(void) { return 0; } @@ -156,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void) return 0; } +static inline void rebuild_sched_domains(void) +{ + partition_sched_domains(0, NULL, NULL); +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/init/main.c b/init/main.c index edeace036fd9..dd25259530ea 100644 --- a/init/main.c +++ b/init/main.c @@ -415,6 +415,13 @@ static void __init smp_init(void) { unsigned int cpu; + /* + * Set up the current CPU as possible to migrate to. + * The other ones will be done by cpu_up/cpu_down() + */ + cpu = smp_processor_id(); + cpu_set(cpu, cpu_active_map); + /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) diff --git a/kernel/cpu.c b/kernel/cpu.c index cfb1d43ab801..a1ac7ea245d7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void) cpu_hotplug.refcount = 0; } +cpumask_t cpu_active_map; + #ifdef CONFIG_HOTPLUG_CPU void get_online_cpus(void) @@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu) int err = 0; cpu_maps_update_begin(); - if (cpu_hotplug_disabled) + + if (cpu_hotplug_disabled) { err = -EBUSY; - else - err = _cpu_down(cpu, 0); + goto out; + } + + cpu_clear(cpu, cpu_active_map); + + err = _cpu_down(cpu, 0); + + if (cpu_online(cpu)) + cpu_set(cpu, cpu_active_map); +out: cpu_maps_update_done(); return err; } @@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu) } cpu_maps_update_begin(); - if (cpu_hotplug_disabled) + + if (cpu_hotplug_disabled) { err = -EBUSY; - else - err = _cpu_up(cpu, 0); + goto out; + } + err = _cpu_up(cpu, 0); + + if (cpu_online(cpu)) + cpu_set(cpu, cpu_active_map); + +out: cpu_maps_update_done(); return err; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601947a8..3c3ef02f65f1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * partition_sched_domains(). */ -static void rebuild_sched_domains(void) +void rebuild_sched_domains(void) { struct kfifo *q; /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ee18dbb4516..c237624a8a04 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) - || unlikely(cpu_is_offline(dest_cpu))) + || unlikely(!cpu_active(dest_cpu))) goto out; /* force the process onto the specified CPU */ @@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick) /* * If we are going offline and still the leader, give up! */ - if (cpu_is_offline(cpu) && + if (!cpu_active(cpu) && atomic_read(&nohz.load_balancer) == cpu) { if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); @@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) struct rq *rq_dest, *rq_src; int ret = 0, on_rq; - if (unlikely(cpu_is_offline(dest_cpu))) + if (unlikely(!cpu_active(dest_cpu))) return ret; rq_src = cpu_rq(src_cpu); @@ -7553,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void) { } -/* - * Free current domain masks. - * Called after all cpus are attached to NULL domain. - */ -static void free_sched_domains(void) -{ - ndoms_cur = 0; - if (doms_cur != &fallback_doms) - kfree(doms_cur); - doms_cur = &fallback_doms; -} - /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * ownership of it and will kfree it when done with it. If the caller * failed the kmalloc call, then it can pass in doms_new == NULL, * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms'. + * 'fallback_doms', it also forces the domains to be rebuilt. * * Call with hotplug lock held */ @@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) { - ndoms_new = 1; - doms_new = &fallback_doms; - cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; - } + if (doms_new == NULL) + ndoms_new = 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { @@ -7677,6 +7661,14 @@ match1: ; } + if (doms_new == NULL) { + ndoms_cur = 0; + ndoms_new = 1; + doms_new = &fallback_doms; + cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; + } + /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { @@ -7707,17 +7699,10 @@ match2: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int arch_reinit_sched_domains(void) { - int err; - get_online_cpus(); - mutex_lock(&sched_domains_mutex); - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); - err = arch_init_sched_domains(&cpu_online_map); - mutex_unlock(&sched_domains_mutex); + rebuild_sched_domains(); put_online_cpus(); - - return err; + return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -7783,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +#ifndef CONFIG_CPUSETS /* - * Force a reinitialization of the sched domains hierarchy. The domains - * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to the NULL domain - * which will prevent rebalancing while the sched domains are recalculated. + * Add online and remove offline CPUs from the scheduler domains. + * When cpusets are enabled they take over this function. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + partition_sched_domains(0, NULL, NULL); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} +#endif + +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) { int cpu = (int)(long)hcpu; @@ -7798,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: disable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - detach_destroy_domains(&cpu_online_map); - free_sched_domains(); return NOTIFY_OK; - case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: enable_runtime(cpu_rq(cpu)); - /* fall-through */ - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* - * Fall through and re-initialise the domains. - */ - break; + return NOTIFY_OK; + default: return NOTIFY_DONE; } - -#ifndef CONFIG_CPUSETS - /* - * Create default domain partitioning if cpusets are disabled. - * Otherwise we let cpusets rebuild the domains based on the - * current setup. - */ - - /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(&cpu_online_map); -#endif - - return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7855,8 +7830,15 @@ void __init sched_init_smp(void) cpu_set(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); put_online_cpus(); + +#ifndef CONFIG_CPUSETS /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); +#endif + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2aa987027d6..d924c679dfac 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq) * not idle and an idle cpu is available. The span of cpus to * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. + * Domains may include CPUs that are not usable for migration, + * hence we need to mask them out (cpu_active_map) * * Returns the CPU we should wake onto. */ @@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); + cpus_and(tmp, tmp, cpu_active_map); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { if (i != task_cpu(p)) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d3d1cccb3d7b..50735bb96149 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -933,6 +933,13 @@ static int find_lowest_rq(struct task_struct *task) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return -1; /* No targets found */ + /* + * Only consider CPUs that are usable for migration. + * I guess we might want to change cpupri_find() to ignore those + * in the first place. + */ + cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); + /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect -- cgit v1.2.3 From 39b0fad7121eace85770e7a4c6dc35dfd2879768 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 15 Jul 2008 20:56:26 -0700 Subject: cpu hotplug: Make cpu_active_map synchronization dependency clear This goes on top of the cpu_active_map (take 2) patch. Currently we depend on the stop_machine to provide nescessesary synchronization for the cpu_active_map updates. As Dmitry Adamushko pointed this is fragile and is not much clearer than the previous scheme. In other words we do not want to depend on the internal stop machine operation here. So make the synchronization rules clear by doing synchronize_sched() after clearing out cpu active bit. Tested on quad-Core2 with: while true; do for i in 1 2 3; do echo 0 > /sys/devices/system/cpu/cpu$i/online done for i in 1 2 3; do echo 1 > /sys/devices/system/cpu/cpu$i/online done done and stress -c 200 No lockdep, preempt or other complaints. Signed-off-by: Max Krasnyansky Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/cpu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index a1ac7ea245d7..033603c1d7c3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -301,6 +301,16 @@ int __ref cpu_down(unsigned int cpu) cpu_clear(cpu, cpu_active_map); + /* + * Make sure the all cpus did the reschedule and are not + * using stale version of the cpu_active_map. + * This is not strictly necessary becuase stop_machine() + * that we run down the line already provides the required + * synchronization. But it's really a side effect and we do not + * want to depend on the innards of the stop_machine here. + */ + synchronize_sched(); + err = _cpu_down(cpu, 0); if (cpu_online(cpu)) -- cgit v1.2.3 From 3da1c84c00c7e5fa8348336bd8c342f9128b0f14 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 25 Jul 2008 01:47:50 -0700 Subject: workqueues: make get_online_cpus() useable for work->func() workqueue_cpu_callback(CPU_DEAD) flushes cwq->thread under cpu_maps_update_begin(). This means that the multithreaded workqueues can't use get_online_cpus() due to the possible deadlock, very bad and very old problem. Introduce the new state, CPU_POST_DEAD, which is called after cpu_hotplug_done() but before cpu_maps_update_done(). Change workqueue_cpu_callback() to use CPU_POST_DEAD instead of CPU_DEAD. This means that create/destroy functions can't rely on get_online_cpus() any longer and should take cpu_add_remove_lock instead. [akpm@linux-foundation.org: fix CONFIG_SMP=n] Signed-off-by: Oleg Nesterov Acked-by: Gautham R Shenoy Cc: Heiko Carstens Cc: Max Krasnyansky Cc: Paul Jackson Cc: Paul Menage Cc: Peter Zijlstra Cc: Vegard Nossum Cc: Martin Schwidefsky Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 15 +++++++++++---- include/linux/notifier.h | 2 ++ kernel/cpu.c | 5 +++++ kernel/workqueue.c | 18 +++++++++--------- 4 files changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7464ba3b4333..d7faf8808497 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -69,10 +69,11 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb) #endif int cpu_up(unsigned int cpu); - extern void cpu_hotplug_init(void); +extern void cpu_maps_update_begin(void); +extern void cpu_maps_update_done(void); -#else +#else /* CONFIG_SMP */ static inline int register_cpu_notifier(struct notifier_block *nb) { @@ -87,10 +88,16 @@ static inline void cpu_hotplug_init(void) { } +static inline void cpu_maps_update_begin(void) +{ +} + +static inline void cpu_maps_update_done(void) +{ +} + #endif /* CONFIG_SMP */ extern struct sysdev_class cpu_sysdev_class; -extern void cpu_maps_update_begin(void); -extern void cpu_maps_update_done(void); #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. */ diff --git a/include/linux/notifier.h b/include/linux/notifier.h index bd3d72ddf333..da2698b0fdd1 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -214,6 +214,8 @@ static inline int notifier_to_errno(int ret) #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #define CPU_DYING 0x0008 /* CPU (unsigned)v not running any task, * not handling interrupts, soon dead */ +#define CPU_POST_DEAD 0x0009 /* CPU (unsigned)v dead, cpu_hotplug + * lock is dropped */ /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend * operation in progress diff --git a/kernel/cpu.c b/kernel/cpu.c index 2cc409ce0a8f..10ba5f1004a5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -285,6 +285,11 @@ out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); + if (!err) { + if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, + hcpu) == NOTIFY_BAD) + BUG(); + } return err; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5fbffd302eb5..828e58230cbc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -828,7 +828,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); } else { - get_online_cpus(); + cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); @@ -840,7 +840,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, cpu); start_workqueue_thread(cwq, cpu); } - put_online_cpus(); + cpu_maps_update_done(); } if (err) { @@ -854,8 +854,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key); static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* - * Our caller is either destroy_workqueue() or CPU_DEAD, - * get_online_cpus() protects cwq->thread. + * Our caller is either destroy_workqueue() or CPU_POST_DEAD, + * cpu_add_remove_lock protects cwq->thread. */ if (cwq->thread == NULL) return; @@ -865,7 +865,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) flush_cpu_workqueue(cwq); /* - * If the caller is CPU_DEAD and cwq->worklist was not empty, + * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, * a concurrent flush_workqueue() can insert a barrier after us. * However, in that case run_workqueue() won't return and check * kthread_should_stop() until it flushes all work_struct's. @@ -889,14 +889,14 @@ void destroy_workqueue(struct workqueue_struct *wq) const cpumask_t *cpu_map = wq_cpu_map(wq); int cpu; - get_online_cpus(); + cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); for_each_cpu_mask_nr(cpu, *cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - put_online_cpus(); + cpu_maps_update_done(); free_percpu(wq->cpu_wq); kfree(wq); @@ -935,7 +935,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: start_workqueue_thread(cwq, -1); - case CPU_DEAD: + case CPU_POST_DEAD: cleanup_workqueue_thread(cwq); break; } @@ -943,7 +943,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: cpu_clear(cpu, cpu_populated_map); } -- cgit v1.2.3 From b8d317d10cca76cabe6b03ebfeb23cc99118b731 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:29 -0700 Subject: cpumask: make cpumask_of_cpu_map generic If an arch doesn't define cpumask_of_cpu_map, create a generic statically-initialized one for them. This allows removal of the buggy cpumask_of_cpu() macro (&cpumask_of_cpu() gives address of out-of-scope var). An arch with NR_CPUS of 4096 probably wants to allocate this itself based on the actual number of CPUs, since otherwise they're using 2MB of rodata (1024 cpus means 128k). That's what CONFIG_HAVE_CPUMASK_OF_CPU_MAP is for (only x86/64 does so at the moment). In future as we support more CPUs, we'll need to resort to a get_cpu_map()/put_cpu_map() allocation scheme. Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Jack Steiner Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 41 ++---------------- kernel/cpu.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 38 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1b5c98e7fef7..8fa3b6d4a320 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -62,15 +62,7 @@ * int next_cpu_nr(cpu, mask) Next cpu past 'cpu', or nr_cpu_ids * * cpumask_t cpumask_of_cpu(cpu) Return cpumask with bit 'cpu' set - *ifdef CONFIG_HAS_CPUMASK_OF_CPU - * cpumask_of_cpu_ptr_declare(v) Declares cpumask_t *v - * cpumask_of_cpu_ptr_next(v, cpu) Sets v = &cpumask_of_cpu_map[cpu] - * cpumask_of_cpu_ptr(v, cpu) Combines above two operations - *else - * cpumask_of_cpu_ptr_declare(v) Declares cpumask_t _v and *v = &_v - * cpumask_of_cpu_ptr_next(v, cpu) Sets _v = cpumask_of_cpu(cpu) - * cpumask_of_cpu_ptr(v, cpu) Combines above two operations - *endif + * (can be used as an lvalue) * CPU_MASK_ALL Initializer - all bits set * CPU_MASK_NONE Initializer - no bits set * unsigned long *cpus_addr(mask) Array of unsigned long's in mask @@ -274,36 +266,9 @@ static inline void __cpus_shift_left(cpumask_t *dstp, } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -extern cpumask_t *cpumask_of_cpu_map; +/* cpumask_of_cpu_map[] is in kernel/cpu.c */ +extern const cpumask_t *cpumask_of_cpu_map; #define cpumask_of_cpu(cpu) (cpumask_of_cpu_map[cpu]) -#define cpumask_of_cpu_ptr(v, cpu) \ - const cpumask_t *v = &cpumask_of_cpu(cpu) -#define cpumask_of_cpu_ptr_declare(v) \ - const cpumask_t *v -#define cpumask_of_cpu_ptr_next(v, cpu) \ - v = &cpumask_of_cpu(cpu) -#else -#define cpumask_of_cpu(cpu) \ -({ \ - typeof(_unused_cpumask_arg_) m; \ - if (sizeof(m) == sizeof(unsigned long)) { \ - m.bits[0] = 1UL<<(cpu); \ - } else { \ - cpus_clear(m); \ - cpu_set((cpu), m); \ - } \ - m; \ -}) -#define cpumask_of_cpu_ptr(v, cpu) \ - cpumask_t _##v = cpumask_of_cpu(cpu); \ - const cpumask_t *v = &_##v -#define cpumask_of_cpu_ptr_declare(v) \ - cpumask_t _##v; \ - const cpumask_t *v = &_##v -#define cpumask_of_cpu_ptr_next(v, cpu) \ - _##v = cpumask_of_cpu(cpu) -#endif #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..fe31ff3d3809 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -461,3 +461,112 @@ out: #endif /* CONFIG_PM_SLEEP_SMP */ #endif /* CONFIG_SMP */ + +#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +/* 64 bits of zeros, for initializers. */ +#if BITS_PER_LONG == 32 +#define Z64 0, 0 +#else +#define Z64 0 +#endif + +/* Initializer macros. */ +#define CMI0(n) { .bits = { 1UL << (n) } } +#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } } + +#define CMI8(n, ...) \ + CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__), \ + CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__), \ + CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__), \ + CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__) + +#if BITS_PER_LONG == 32 +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__), \ + CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__) +#else +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__), \ + CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__) +#endif + +#define CMI256(n, ...) \ + CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__), \ + CMI64((n)+128, Z64, Z64, __VA_ARGS__), \ + CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__) +#define Z256 Z64, Z64, Z64, Z64 + +#define CMI1024(n, ...) \ + CMI256((n), __VA_ARGS__), \ + CMI256((n)+256, Z256, __VA_ARGS__), \ + CMI256((n)+512, Z256, Z256, __VA_ARGS__), \ + CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__) +#define Z1024 Z256, Z256, Z256, Z256 + +/* We want this statically initialized, just to be safe. We try not + * to waste too much space, either. */ +static const cpumask_t cpumask_map[] = { + CMI0(0), CMI0(1), CMI0(2), CMI0(3), +#if NR_CPUS > 4 + CMI0(4), CMI0(5), CMI0(6), CMI0(7), +#endif +#if NR_CPUS > 8 + CMI0(8), CMI0(9), CMI0(10), CMI0(11), + CMI0(12), CMI0(13), CMI0(14), CMI0(15), +#endif +#if NR_CPUS > 16 + CMI0(16), CMI0(17), CMI0(18), CMI0(19), + CMI0(20), CMI0(21), CMI0(22), CMI0(23), + CMI0(24), CMI0(25), CMI0(26), CMI0(27), + CMI0(28), CMI0(29), CMI0(30), CMI0(31), +#endif +#if NR_CPUS > 32 +#if BITS_PER_LONG == 32 + CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), + CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), + CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), + CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), + CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), + CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), + CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), + CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), +#else + CMI0(32), CMI0(33), CMI0(34), CMI0(35), + CMI0(36), CMI0(37), CMI0(38), CMI0(39), + CMI0(40), CMI0(41), CMI0(42), CMI0(43), + CMI0(44), CMI0(45), CMI0(46), CMI0(47), + CMI0(48), CMI0(49), CMI0(50), CMI0(51), + CMI0(52), CMI0(53), CMI0(54), CMI0(55), + CMI0(56), CMI0(57), CMI0(58), CMI0(59), + CMI0(60), CMI0(61), CMI0(62), CMI0(63), +#endif /* BITS_PER_LONG == 64 */ +#endif +#if NR_CPUS > 64 + CMI64(64, Z64), +#endif +#if NR_CPUS > 128 + CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), +#endif +#if NR_CPUS > 256 + CMI256(256, Z256), +#endif +#if NR_CPUS > 512 + CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), +#endif +#if NR_CPUS > 1024 + CMI1024(1024, Z1024), +#endif +#if NR_CPUS > 2048 + CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), +#endif +#if NR_CPUS > 4096 +#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +#endif +}; + +const cpumask_t *cpumask_of_cpu_map = cpumask_map; +#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 10 ++++++---- kernel/cpu.c | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel/cpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f7745f94c006..1cd53dfcd309 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -81,10 +81,12 @@ static void __init setup_per_cpu_maps(void) } #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ +/* + * Replace static cpumask_of_cpu_map in the initdata section, + * with one that's allocated sized by the possible number of cpus. + * + * (requires nr_cpu_ids to be initialized) + */ static void __init setup_cpumask_of_cpu(void) { int i; diff --git a/kernel/cpu.c b/kernel/cpu.c index fe31ff3d3809..9d4e1c28c053 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,6 @@ out: #endif /* CONFIG_SMP */ -#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP /* 64 bits of zeros, for initializers. */ #if BITS_PER_LONG == 32 #define Z64 0, 0 @@ -509,7 +508,11 @@ out: /* We want this statically initialized, just to be safe. We try not * to waste too much space, either. */ -static const cpumask_t cpumask_map[] = { +static const cpumask_t cpumask_map[] +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +__initdata +#endif += { CMI0(0), CMI0(1), CMI0(2), CMI0(3), #if NR_CPUS > 4 CMI0(4), CMI0(5), CMI0(6), CMI0(7), @@ -569,4 +572,3 @@ static const cpumask_t cpumask_map[] = { }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; -#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 5a7a201c51c324876d00a54e7208af6af12d1ca4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 16:50:47 +0200 Subject: cpumask: export cpumask_of_cpu_map fix: ERROR: "cpumask_of_cpu_map" [drivers/acpi/processor.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/microcode.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/speedstep-ich.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9d4e1c28c053..a35d8995dc8c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -572,3 +572,5 @@ __initdata }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; + +EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); -- cgit v1.2.3 From ffdb5976c47609c862917d4c186ecbb5706d2dda Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:28 -0500 Subject: Simplify stop_machine stop_machine creates a kthread which creates kernel threads. We can create those threads directly and simplify things a little. Some care must be taken with CPU hotunplug, which has special needs, but that code seems more robust than it was in the past. Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- include/linux/stop_machine.h | 20 ++- kernel/cpu.c | 13 +- kernel/stop_machine.c | 293 ++++++++++++++++++------------------------- 3 files changed, 136 insertions(+), 190 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 18af011c13af..36c2c7284eb3 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -17,13 +17,12 @@ * @data: the data ptr for the @fn() * @cpu: if @cpu == n, run @fn() on cpu n * if @cpu == NR_CPUS, run @fn() on any cpu - * if @cpu == ALL_CPUS, run @fn() first on the calling cpu, and then - * concurrently on all the other cpus + * if @cpu == ALL_CPUS, run @fn() on every online CPU. * - * Description: This causes a thread to be scheduled on every other cpu, - * each of which disables interrupts, and finally interrupts are disabled - * on the current CPU. The result is that noone is holding a spinlock - * or inside any other preempt-disabled region when @fn() runs. + * Description: This causes a thread to be scheduled on every cpu, + * each of which disables interrupts. The result is that noone is + * holding a spinlock or inside any other preempt-disabled region when + * @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ @@ -35,13 +34,10 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); * @data: the data ptr for the @fn * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS. * - * Description: This is a special version of the above, which returns the - * thread which has run @fn(): kthread_stop will return the return value - * of @fn(). Used by hotplug cpu. + * Description: This is a special version of the above, which assumes cpus + * won't come or go while it's being called. Used by hotplug cpu. */ -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu); - +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); #else static inline int stop_machine_run(int (*fn)(void *), void *data, diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..cf79bb911371 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -250,19 +249,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -279,8 +274,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a473bd0cb71b..35882dccc943 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,4 +1,4 @@ -/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. +/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ #include @@ -13,220 +13,177 @@ #include #include -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ STOPMACHINE_PREPARE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, }; +static enum stopmachine_state state; struct stop_machine_data { int (*fn)(void *); void *data; - struct completion done; - int run_all; -} smdata; + int fnret; +}; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; +static DEFINE_MUTEX(lock); -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; - int ran = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - - set_cpus_allowed_ptr(current, cpumask); - - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); - - /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. */ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { - smdata.fn(smdata.data); - ran = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); - - return 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; } -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) { - atomic_set(&stopmachine_thread_ack, 0); - smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + if (atomic_dec_and_test(&thread_ack)) { + /* If we're the last one to ack the EXIT, we're finished. */ + if (state == STOPMACHINE_EXIT) + complete(&finished); + else + set_state(state + 1); + } } -static int stop_machine(void) +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ +static int stop_cpu(struct stop_machine_data *smdata) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; + enum stopmachine_state curstate = STOPMACHINE_NONE; + int uninitialized_var(ret); - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } - - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { - yield(); + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - } - - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } - - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); - - /* Make them disable irqs. */ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); - - return 0; -} + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); + } + } while (curstate != STOPMACHINE_EXIT); -static void restart_machine(void) -{ - stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + do_exit(0); } -static void run_other_cpus(void) +/* Callback for CPUs which aren't supposed to do anything. */ +static int chill(void *unused) { - stopmachine_set_state(STOPMACHINE_RUN); + return 0; } -static int do_stop(void *_smdata) +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct stop_machine_data *smdata = _smdata; - int ret; + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; + + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; + + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + cpu = any_online_cpu(cpu_online_map); + + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; + + /* Set up initial state. */ + mutex_lock(&lock); + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_PREPARE); - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - if (smdata->run_all) - run_other_cpus(); - restart_machine(); - } + for_each_online_cpu(i) { + struct stop_machine_data *smdata; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + if (cpu == ALL_CPUS || i == cpu) + smdata = &active; + else + smdata = &idle; + + threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", + i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return ret; -} + /* Place it onto correct cpu. */ + kthread_bind(threads[i], i); -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; + /* Make it highest prio. */ + if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) + BUG(); + } - mutex_lock(&stopmachine_mutex); + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + cpu = get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - smdata.fn = fn; - smdata.data = data; - smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; - init_completion(&smdata.done); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); + mutex_unlock(&lock); - smp_wmb(); /* make sure other cpus see smdata updates */ + kfree(threads); - /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS || cpu == ALL_CPUS) - cpu = raw_smp_processor_id(); + return active.fnret; - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + mutex_unlock(&lock); - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; + kfree(threads); + return err; } int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine_run(fn, data, cpu); put_online_cpus(); return ret; -- cgit v1.2.3 From 04321587584272f4e8b9818f319f40caf8eeee13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:29 -0500 Subject: Hotplug CPU: don't check cpu_online after take_cpu_down Akinobu points out that if take_cpu_down() succeeds, the cpu must be offline. Remove the cpu_online() check, and put a BUG_ON(). Quoting Akinobu Mita: Actually the cpu_online() check was necessary before appling this stop_machine: simplify patch. With old __stop_machine_run(), __stop_machine_run() could succeed (return !IS_ERR(p) value) even if take_cpu_down() returned non-zero value. The return value of take_cpu_down() was obtained through kthread_stop().. Signed-off-by: Rusty Russell Cc: "Akinobu Mita" --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index cf79bb911371..53cf508f975a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -250,8 +250,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed_ptr(current, &tmp); err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - - if (err || cpu_online(cpu)) { + if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -259,6 +258,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_allowed; } + BUG_ON(cpu_online(cpu)); /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From eeec4fad963490821348a331cca6102ae1c4a7a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine(): stop_machine_run() changed to use cpu mask Instead of a "cpu" arg with magic values NR_CPUS (any cpu) and ~0 (all cpus), pass a cpumask_t. Allow NULL for the common case (where we don't care which CPU the function is run on): temporary cpumask_t's are usually considered bad for stack space. This deprecates stop_machine_run, to be removed soon when all the callers are dead. Signed-off-by: Rusty Russell --- include/linux/stop_machine.h | 34 ++++++++++++++++++++++++---------- kernel/cpu.c | 3 ++- kernel/stop_machine.c | 27 +++++++++++++-------------- 3 files changed, 39 insertions(+), 25 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 36c2c7284eb3..f1cb0ba6d715 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -5,19 +5,19 @@ (and more). So the "read" side to such a lock is anything which diables preeempt. */ #include +#include #include #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +/* Deprecated, but useful for transition. */ #define ALL_CPUS ~0U /** - * stop_machine_run: freeze the machine on all CPUs and run this function + * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn() - * @cpu: if @cpu == n, run @fn() on cpu n - * if @cpu == NR_CPUS, run @fn() on any cpu - * if @cpu == ALL_CPUS, run @fn() on every online CPU. + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) * * Description: This causes a thread to be scheduled on every cpu, * each of which disables interrupts. The result is that noone is @@ -26,22 +26,22 @@ * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); /** - * __stop_machine_run: freeze the machine on all CPUs and run this function + * __stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn - * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS. + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) * * Description: This is a special version of the above, which assumes cpus * won't come or go while it's being called. Used by hotplug cpu. */ -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); #else -static inline int stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) +static inline int stop_machine(int (*fn)(void *), void *data, + const cpumask_t *cpus) { int ret; local_irq_disable(); @@ -50,4 +50,18 @@ static inline int stop_machine_run(int (*fn)(void *), void *data, return ret; } #endif /* CONFIG_SMP */ + +static inline int __deprecated stop_machine_run(int (*fn)(void *), void *data, + unsigned int cpu) +{ + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + return stop_machine(fn, data, NULL); + else if (cpu == ~0U) + return stop_machine(fn, data, &cpu_possible_map); + else { + cpumask_t cpus = cpumask_of_cpu(cpu); + return stop_machine(fn, data, &cpus); + } +} #endif /* _LINUX_STOP_MACHINE */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 53cf508f975a..29510d68338a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -248,8 +248,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpus_setall(tmp); cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); + tmp = cpumask_of_cpu(cpu); - err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine(take_cpu_down, &tcd_param, &tmp); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 35882dccc943..e446c7c7d6a9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int chill(void *unused) return 0; } -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int i, err; struct stop_machine_data active, idle; @@ -112,10 +112,6 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) idle.fn = chill; idle.data = NULL; - /* If they don't care which cpu fn runs on, just pick one. */ - if (cpu == NR_CPUS) - cpu = any_online_cpu(cpu_online_map); - /* This could be too big for stack on large machines. */ threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); if (!threads) @@ -128,13 +124,16 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) set_state(STOPMACHINE_PREPARE); for_each_online_cpu(i) { - struct stop_machine_data *smdata; + struct stop_machine_data *smdata = &idle; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - if (cpu == ALL_CPUS || i == cpu) - smdata = &active; - else - smdata = &idle; + if (!cpus) { + if (i == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(i, *cpus)) + smdata = &active; + } threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", i); @@ -154,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) /* We've created all the threads. Wake them all: hold this CPU so one * doesn't hit this CPU until we're ready. */ - cpu = get_cpu(); + get_cpu(); for_each_online_cpu(i) wake_up_process(threads[i]); @@ -177,15 +176,15 @@ kill_threads: return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - ret = __stop_machine_run(fn, data, cpu); + ret = __stop_machine(fn, data, cpus); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.2.3 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single world (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 23 -------- include/linux/cpumask.h | 26 ++++++++- kernel/cpu.c | 128 +++++++---------------------------------- 3 files changed, 43 insertions(+), 134 deletions(-) (limited to 'kernel/cpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1cd53dfcd309..76e305e064f9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,26 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -/* - * Replace static cpumask_of_cpu_map in the initdata section, - * with one that's allocated sized by the possible number of cpus. - * - * (requires nr_cpu_ids to be initialized) - */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -199,9 +179,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 8fa3b6d4a320..96d0509fb8d8 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -265,10 +265,30 @@ static inline void __cpus_shift_left(cpumask_t *dstp, bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } +/* + * Special-case data structure for "single bit set only" constant CPU masks. + * + * We pre-generate all the 64 (or 32) possible bit positions, with enough + * padding to the left and the right, and return the constant pointer + * appropriately offset. + */ +extern const unsigned long + cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; + +static inline const cpumask_t *get_cpu_mask(unsigned int cpu) +{ + const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; + p -= cpu / BITS_PER_LONG; + return (const cpumask_t *)p; +} + +/* + * In cases where we take the address of the cpumask immediately, + * gcc optimizes it out (it's a constant) and there's no huge stack + * variable created: + */ +#define cpumask_of_cpu(cpu) ({ *get_cpu_mask(cpu); }) -/* cpumask_of_cpu_map[] is in kernel/cpu.c */ -extern const cpumask_t *cpumask_of_cpu_map; -#define cpumask_of_cpu(cpu) (cpumask_of_cpu_map[cpu]) #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) diff --git a/kernel/cpu.c b/kernel/cpu.c index a35d8995dc8c..06a8358bb418 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,115 +462,27 @@ out: #endif /* CONFIG_SMP */ -/* 64 bits of zeros, for initializers. */ -#if BITS_PER_LONG == 32 -#define Z64 0, 0 -#else -#define Z64 0 -#endif +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1< 4 - CMI0(4), CMI0(5), CMI0(6), CMI0(7), -#endif -#if NR_CPUS > 8 - CMI0(8), CMI0(9), CMI0(10), CMI0(11), - CMI0(12), CMI0(13), CMI0(14), CMI0(15), -#endif -#if NR_CPUS > 16 - CMI0(16), CMI0(17), CMI0(18), CMI0(19), - CMI0(20), CMI0(21), CMI0(22), CMI0(23), - CMI0(24), CMI0(25), CMI0(26), CMI0(27), - CMI0(28), CMI0(29), CMI0(30), CMI0(31), -#endif -#if NR_CPUS > 32 -#if BITS_PER_LONG == 32 - CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), - CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), - CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), - CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), - CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), - CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), - CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), - CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), -#else - CMI0(32), CMI0(33), CMI0(34), CMI0(35), - CMI0(36), CMI0(37), CMI0(38), CMI0(39), - CMI0(40), CMI0(41), CMI0(42), CMI0(43), - CMI0(44), CMI0(45), CMI0(46), CMI0(47), - CMI0(48), CMI0(49), CMI0(50), CMI0(51), - CMI0(52), CMI0(53), CMI0(54), CMI0(55), - CMI0(56), CMI0(57), CMI0(58), CMI0(59), - CMI0(60), CMI0(61), CMI0(62), CMI0(63), -#endif /* BITS_PER_LONG == 64 */ -#endif -#if NR_CPUS > 64 - CMI64(64, Z64), -#endif -#if NR_CPUS > 128 - CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), -#endif -#if NR_CPUS > 256 - CMI256(256, Z256), -#endif -#if NR_CPUS > 512 - CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), -#endif -#if NR_CPUS > 1024 - CMI1024(1024, Z1024), -#endif -#if NR_CPUS > 2048 - CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), -#endif -#if NR_CPUS > 4096 -#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; - -const cpumask_t *cpumask_of_cpu_map = cpumask_map; - -EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); -- cgit v1.2.3 From 279ef6bbb8308488398c8f33b04c760148428378 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 30 Jul 2008 12:34:04 +0200 Subject: sched, cpu hotplug: fix set_cpus_allowed() use in hotplug callbacks Mark Langsdorf reported: > One of my co-workers noticed that the powernow-k8 > driver no longer restarts when a CPU core is > hot-disabled and then hot-enabled on AMD quad-core > systems. > > The following comands work fine on 2.6.26 and fail > on 2.6.27-rc1: > > echo 0 > /sys/devices/system/cpu/cpu3/online > echo 1 > /sys/devices/system/cpu/cpu3/online > find /sys -name cpufreq > > For 2.6.26, the find will return a cpufreq > directory for each processor. In 2.6.27-rc1, > the cpu3 directory is missing. > > After digging through the code, the following > logic is failing when the core is hot-enabled > at runtime. The code works during the boot > sequence. > > cpumask_t = current->cpus_allowed; > set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); > if (smp_processor_id() != cpu) > return -ENODEV; So set the CPU active before calling the CPU_ONLINE notifier chain, there are a handful of notifiers that use set_cpus_allowed(). This fix also solves the problem with x86-microcode. I've sent alternative patches for microcode, but as this "rely on set_cpus_allowed_ptr() being workable in cpu-hotplug(CPU_ONLINE, ...)" assumption seems to be more broad than what we thought, perhaps this fix should be applied. With this patch we define that by the moment CPU_ONLINE is being sent, a 'cpu' is online and ready for tasks to be migrated onto it. Signed-off-by: Dmitry Adamushko Reported-by: Mark Langsdorf Tested-by: Mark Langsdorf Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index e202a68d1cc1..c977c339f559 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -349,6 +349,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + cpu_set(cpu, cpu_active_map); + /* Now call notifier in preparation. */ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); @@ -383,9 +385,6 @@ int __cpuinit cpu_up(unsigned int cpu) err = _cpu_up(cpu, 0); - if (cpu_online(cpu)) - cpu_set(cpu, cpu_active_map); - out: cpu_maps_update_done(); return err; -- cgit v1.2.3 From 3ee1062b4ee82a56294808a065b64f4bc6781a56 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Aug 2008 15:08:40 -0700 Subject: cpu hotplug: s390 doesn't support additional_cpus anymore. s390 doesn't support the additional_cpus kernel parameter anymore since a long time. So we better update the code and documentation to reflect that. Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cpu-hotplug.txt | 5 ----- kernel/cpu.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/cpu.c') diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index ba0aacde94fb..94bbc27ddd4f 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -59,15 +59,10 @@ apicid values in those tables for disabled apics. In the event BIOS doesn't mark such hot-pluggable cpus as disabled entries, one could use this parameter "additional_cpus=x" to represent those cpus in the cpu_possible_map. -s390 uses the number of cpus it detects at IPL time to also the number of bits -in cpu_possible_map. If it is desired to add additional cpus at a later time -the number should be specified using this option or the possible_cpus option. - possible_cpus=n [s390 only] use this to set hotpluggable cpus. This option sets possible_cpus bits in cpu_possible_map. Thus keeping the numbers of bits set constant even if the machine gets rebooted. - This option overrides additional_cpus. CPU maps and such ----------------- diff --git a/kernel/cpu.c b/kernel/cpu.c index c977c339f559..f17e9854c246 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -369,7 +369,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (!cpu_isset(cpu, cpu_possible_map)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif -- cgit v1.2.3