From b8d317d10cca76cabe6b03ebfeb23cc99118b731 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:29 -0700 Subject: cpumask: make cpumask_of_cpu_map generic If an arch doesn't define cpumask_of_cpu_map, create a generic statically-initialized one for them. This allows removal of the buggy cpumask_of_cpu() macro (&cpumask_of_cpu() gives address of out-of-scope var). An arch with NR_CPUS of 4096 probably wants to allocate this itself based on the actual number of CPUs, since otherwise they're using 2MB of rodata (1024 cpus means 128k). That's what CONFIG_HAVE_CPUMASK_OF_CPU_MAP is for (only x86/64 does so at the moment). In future as we support more CPUs, we'll need to resort to a get_cpu_map()/put_cpu_map() allocation scheme. Signed-off-by: Mike Travis Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Jack Steiner Signed-off-by: Ingo Molnar --- kernel/cpu.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..fe31ff3d3809 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -461,3 +461,112 @@ out: #endif /* CONFIG_PM_SLEEP_SMP */ #endif /* CONFIG_SMP */ + +#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +/* 64 bits of zeros, for initializers. */ +#if BITS_PER_LONG == 32 +#define Z64 0, 0 +#else +#define Z64 0 +#endif + +/* Initializer macros. */ +#define CMI0(n) { .bits = { 1UL << (n) } } +#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } } + +#define CMI8(n, ...) \ + CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__), \ + CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__), \ + CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__), \ + CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__) + +#if BITS_PER_LONG == 32 +#define CMI64(n, ...) 
\ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__), \ + CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__) +#else +#define CMI64(n, ...) \ + CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__), \ + CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__), \ + CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__), \ + CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__) +#endif + +#define CMI256(n, ...) \ + CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__), \ + CMI64((n)+128, Z64, Z64, __VA_ARGS__), \ + CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__) +#define Z256 Z64, Z64, Z64, Z64 + +#define CMI1024(n, ...) \ + CMI256((n), __VA_ARGS__), \ + CMI256((n)+256, Z256, __VA_ARGS__), \ + CMI256((n)+512, Z256, Z256, __VA_ARGS__), \ + CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__) +#define Z1024 Z256, Z256, Z256, Z256 + +/* We want this statically initialized, just to be safe. We try not + * to waste too much space, either. 
*/ +static const cpumask_t cpumask_map[] = { + CMI0(0), CMI0(1), CMI0(2), CMI0(3), +#if NR_CPUS > 4 + CMI0(4), CMI0(5), CMI0(6), CMI0(7), +#endif +#if NR_CPUS > 8 + CMI0(8), CMI0(9), CMI0(10), CMI0(11), + CMI0(12), CMI0(13), CMI0(14), CMI0(15), +#endif +#if NR_CPUS > 16 + CMI0(16), CMI0(17), CMI0(18), CMI0(19), + CMI0(20), CMI0(21), CMI0(22), CMI0(23), + CMI0(24), CMI0(25), CMI0(26), CMI0(27), + CMI0(28), CMI0(29), CMI0(30), CMI0(31), +#endif +#if NR_CPUS > 32 +#if BITS_PER_LONG == 32 + CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), + CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), + CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), + CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), + CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), + CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), + CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), + CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), +#else + CMI0(32), CMI0(33), CMI0(34), CMI0(35), + CMI0(36), CMI0(37), CMI0(38), CMI0(39), + CMI0(40), CMI0(41), CMI0(42), CMI0(43), + CMI0(44), CMI0(45), CMI0(46), CMI0(47), + CMI0(48), CMI0(49), CMI0(50), CMI0(51), + CMI0(52), CMI0(53), CMI0(54), CMI0(55), + CMI0(56), CMI0(57), CMI0(58), CMI0(59), + CMI0(60), CMI0(61), CMI0(62), CMI0(63), +#endif /* BITS_PER_LONG == 64 */ +#endif +#if NR_CPUS > 64 + CMI64(64, Z64), +#endif +#if NR_CPUS > 128 + CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), +#endif +#if NR_CPUS > 256 + CMI256(256, Z256), +#endif +#if NR_CPUS > 512 + CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), +#endif +#if NR_CPUS > 1024 + CMI1024(1024, Z1024), +#endif +#if NR_CPUS > 2048 + CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), +#endif +#if NR_CPUS > 4096 +#error NR_CPUS too big. 
Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +#endif +}; + +const cpumask_t *cpumask_of_cpu_map = cpumask_map; +#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 10 ++++++---- kernel/cpu.c | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel/cpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f7745f94c006..1cd53dfcd309 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -81,10 +81,12 @@ static void __init setup_per_cpu_maps(void) } #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ +/* + * Replace static cpumask_of_cpu_map in the initdata section, + * with one that's allocated sized by the possible number of cpus. + * + * (requires nr_cpu_ids to be initialized) + */ static void __init setup_cpumask_of_cpu(void) { int i; diff --git a/kernel/cpu.c b/kernel/cpu.c index fe31ff3d3809..9d4e1c28c053 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,6 @@ out: #endif /* CONFIG_SMP */ -#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP /* 64 bits of zeros, for initializers. */ #if BITS_PER_LONG == 32 #define Z64 0, 0 @@ -509,7 +508,11 @@ out: /* We want this statically initialized, just to be safe. We try not * to waste too much space, either. 
*/ -static const cpumask_t cpumask_map[] = { +static const cpumask_t cpumask_map[] +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +__initdata +#endif += { CMI0(0), CMI0(1), CMI0(2), CMI0(3), #if NR_CPUS > 4 CMI0(4), CMI0(5), CMI0(6), CMI0(7), @@ -569,4 +572,3 @@ static const cpumask_t cpumask_map[] = { }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; -#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */ -- cgit v1.2.3 From 5a7a201c51c324876d00a54e7208af6af12d1ca4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 16:50:47 +0200 Subject: cpumask: export cpumask_of_cpu_map fix: ERROR: "cpumask_of_cpu_map" [drivers/acpi/processor.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/microcode.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/speedstep-ich.ko] undefined! ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9d4e1c28c053..a35d8995dc8c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -572,3 +572,5 @@ __initdata }; const cpumask_t *cpumask_of_cpu_map = cpumask_map; + +EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); -- cgit v1.2.3 From ffdb5976c47609c862917d4c186ecbb5706d2dda Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:28 -0500 Subject: Simplify stop_machine stop_machine creates a kthread which creates kernel threads. We can create those threads directly and simplify things a little. Some care must be taken with CPU hotunplug, which has special needs, but that code seems more robust than it was in the past. 
Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- include/linux/stop_machine.h | 20 ++- kernel/cpu.c | 13 +- kernel/stop_machine.c | 293 ++++++++++++++++++------------------------- 3 files changed, 136 insertions(+), 190 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 18af011c13af..36c2c7284eb3 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -17,13 +17,12 @@ * @data: the data ptr for the @fn() * @cpu: if @cpu == n, run @fn() on cpu n * if @cpu == NR_CPUS, run @fn() on any cpu - * if @cpu == ALL_CPUS, run @fn() first on the calling cpu, and then - * concurrently on all the other cpus + * if @cpu == ALL_CPUS, run @fn() on every online CPU. * - * Description: This causes a thread to be scheduled on every other cpu, - * each of which disables interrupts, and finally interrupts are disabled - * on the current CPU. The result is that noone is holding a spinlock - * or inside any other preempt-disabled region when @fn() runs. + * Description: This causes a thread to be scheduled on every cpu, + * each of which disables interrupts. The result is that noone is + * holding a spinlock or inside any other preempt-disabled region when + * @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ @@ -35,13 +34,10 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); * @data: the data ptr for the @fn * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS. * - * Description: This is a special version of the above, which returns the - * thread which has run @fn(): kthread_stop will return the return value - * of @fn(). Used by hotplug cpu. + * Description: This is a special version of the above, which assumes cpus + * won't come or go while it's being called. Used by hotplug cpu. 
*/ -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu); - +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); #else static inline int stop_machine_run(int (*fn)(void *), void *data, diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..cf79bb911371 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -250,19 +249,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - if (IS_ERR(p) || cpu_online(cpu)) { + if (err || cpu_online(cpu)) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } /* Wait for it to sleep (leaving idle task). */ @@ -279,8 +274,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index a473bd0cb71b..35882dccc943 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,4 +1,4 @@ -/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. +/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. 
*/ #include @@ -13,220 +13,177 @@ #include #include -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ STOPMACHINE_PREPARE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, }; +static enum stopmachine_state state; struct stop_machine_data { int (*fn)(void *); void *data; - struct completion done; - int run_all; -} smdata; + int fnret; +}; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; +static DEFINE_MUTEX(lock); -static int stopmachine(void *cpu) +static void set_state(enum stopmachine_state newstate) { - int irqs_disabled = 0; - int prepared = 0; - int ran = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - - set_cpus_allowed_ptr(current, cpumask); - - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); - - /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. 
*/ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_RUN && !ran) { - smdata.fn(smdata.data); - ran = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); - - return 0; + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); + smp_wmb(); + state = newstate; } -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) { - atomic_set(&stopmachine_thread_ack, 0); - smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + if (atomic_dec_and_test(&thread_ack)) { + /* If we're the last one to ack the EXIT, we're finished. */ + if (state == STOPMACHINE_EXIT) + complete(&finished); + else + set_state(state + 1); + } } -static int stop_machine(void) +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. 
*/ +static int stop_cpu(struct stop_machine_data *smdata) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; + enum stopmachine_state curstate = STOPMACHINE_NONE; + int uninitialized_var(ret); - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } - - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { - yield(); + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ cpu_relax(); - } - - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } - - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); - - /* Make them disable irqs. */ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); - - return 0; -} + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); + } + } while (curstate != STOPMACHINE_EXIT); -static void restart_machine(void) -{ - stopmachine_set_state(STOPMACHINE_EXIT); local_irq_enable(); - preempt_enable_no_resched(); + do_exit(0); } -static void run_other_cpus(void) +/* Callback for CPUs which aren't supposed to do anything. 
*/ +static int chill(void *unused) { - stopmachine_set_state(STOPMACHINE_RUN); + return 0; } -static int do_stop(void *_smdata) +int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct stop_machine_data *smdata = _smdata; - int ret; + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; + + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; + + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + cpu = any_online_cpu(cpu_online_map); + + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; + + /* Set up initial state. */ + mutex_lock(&lock); + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_PREPARE); - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - if (smdata->run_all) - run_other_cpus(); - restart_machine(); - } + for_each_online_cpu(i) { + struct stop_machine_data *smdata; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + if (cpu == ALL_CPUS || i == cpu) + smdata = &active; + else + smdata = &idle; + + threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", + i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - return ret; -} + /* Place it onto correct cpu. */ + kthread_bind(threads[i], i); -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; + /* Make it highest prio. 
*/ + if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param)) + BUG(); + } - mutex_lock(&stopmachine_mutex); + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + cpu = get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - smdata.fn = fn; - smdata.data = data; - smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0; - init_completion(&smdata.done); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); + mutex_unlock(&lock); - smp_wmb(); /* make sure other cpus see smdata updates */ + kfree(threads); - /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS || cpu == ALL_CPUS) - cpu = raw_smp_processor_id(); + return active.fnret; - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + mutex_unlock(&lock); - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler_nocheck(p, SCHED_FIFO, &param); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; + kfree(threads); + return err; } int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine_run(fn, data, cpu); put_online_cpus(); return ret; -- cgit v1.2.3 From 04321587584272f4e8b9818f319f40caf8eeee13 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:29 -0500 Subject: Hotplug CPU: don't check cpu_online after take_cpu_down Akinobu points out that if take_cpu_down() succeeds, the cpu must be offline.
Remove the cpu_online() check, and put a BUG_ON(). Quoting Akinobu Mita: Actually the cpu_online() check was necessary before applying this stop_machine: simplify patch. With old __stop_machine_run(), __stop_machine_run() could succeed (return !IS_ERR(p) value) even if take_cpu_down() returned non-zero value. The return value of take_cpu_down() was obtained through kthread_stop(). Signed-off-by: Rusty Russell Cc: "Akinobu Mita" --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index cf79bb911371..53cf508f975a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -250,8 +250,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) set_cpus_allowed_ptr(current, &tmp); err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - - if (err || cpu_online(cpu)) { + if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -259,6 +258,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_allowed; } + BUG_ON(cpu_online(cpu)); /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) -- cgit v1.2.3 From eeec4fad963490821348a331cca6102ae1c4a7a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 28 Jul 2008 12:16:30 -0500 Subject: stop_machine(): stop_machine_run() changed to use cpu mask Instead of a "cpu" arg with magic values NR_CPUS (any cpu) and ~0 (all cpus), pass a cpumask_t. Allow NULL for the common case (where we don't care which CPU the function is run on): temporary cpumask_t's are usually considered bad for stack space. This deprecates stop_machine_run, to be removed soon when all the callers are dead.
Signed-off-by: Rusty Russell --- include/linux/stop_machine.h | 34 ++++++++++++++++++++++++---------- kernel/cpu.c | 3 ++- kernel/stop_machine.c | 27 +++++++++++++-------------- 3 files changed, 39 insertions(+), 25 deletions(-) (limited to 'kernel/cpu.c') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 36c2c7284eb3..f1cb0ba6d715 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -5,19 +5,19 @@ (and more). So the "read" side to such a lock is anything which diables preeempt. */ #include +#include #include #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +/* Deprecated, but useful for transition. */ #define ALL_CPUS ~0U /** - * stop_machine_run: freeze the machine on all CPUs and run this function + * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn() - * @cpu: if @cpu == n, run @fn() on cpu n - * if @cpu == NR_CPUS, run @fn() on any cpu - * if @cpu == ALL_CPUS, run @fn() on every online CPU. + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) * * Description: This causes a thread to be scheduled on every cpu, * each of which disables interrupts. The result is that noone is @@ -26,22 +26,22 @@ * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); /** - * __stop_machine_run: freeze the machine on all CPUs and run this function + * __stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn - * @cpu: the cpu to run @fn on (or any, if @cpu == NR_CPUS. + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) * * Description: This is a special version of the above, which assumes cpus * won't come or go while it's being called. 
Used by hotplug cpu. */ -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus); #else -static inline int stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) +static inline int stop_machine(int (*fn)(void *), void *data, + const cpumask_t *cpus) { int ret; local_irq_disable(); @@ -50,4 +50,18 @@ static inline int stop_machine_run(int (*fn)(void *), void *data, return ret; } #endif /* CONFIG_SMP */ + +static inline int __deprecated stop_machine_run(int (*fn)(void *), void *data, + unsigned int cpu) +{ + /* If they don't care which cpu fn runs on, just pick one. */ + if (cpu == NR_CPUS) + return stop_machine(fn, data, NULL); + else if (cpu == ~0U) + return stop_machine(fn, data, &cpu_possible_map); + else { + cpumask_t cpus = cpumask_of_cpu(cpu); + return stop_machine(fn, data, &cpus); + } +} #endif /* _LINUX_STOP_MACHINE */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 53cf508f975a..29510d68338a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -248,8 +248,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpus_setall(tmp); cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); + tmp = cpumask_of_cpu(cpu); - err = __stop_machine_run(take_cpu_down, &tcd_param, cpu); + err = __stop_machine(take_cpu_down, &tcd_param, &tmp); if (err) { /* CPU didn't die: tell everyone. Can't complain. 
*/ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 35882dccc943..e446c7c7d6a9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int chill(void *unused) return 0; } -int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int i, err; struct stop_machine_data active, idle; @@ -112,10 +112,6 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) idle.fn = chill; idle.data = NULL; - /* If they don't care which cpu fn runs on, just pick one. */ - if (cpu == NR_CPUS) - cpu = any_online_cpu(cpu_online_map); - /* This could be too big for stack on large machines. */ threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); if (!threads) @@ -128,13 +124,16 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) set_state(STOPMACHINE_PREPARE); for_each_online_cpu(i) { - struct stop_machine_data *smdata; + struct stop_machine_data *smdata = &idle; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - if (cpu == ALL_CPUS || i == cpu) - smdata = &active; - else - smdata = &idle; + if (!cpus) { + if (i == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(i, *cpus)) + smdata = &active; + } threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", i); @@ -154,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) /* We've created all the threads. Wake them all: hold this CPU so one * doesn't hit this CPU until we're ready. */ - cpu = get_cpu(); + get_cpu(); for_each_online_cpu(i) wake_up_process(threads[i]); @@ -177,15 +176,15 @@ kill_threads: return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { int ret; /* No CPUs can come up or down during this. 
*/ get_online_cpus(); - ret = __stop_machine_run(fn, data, cpu); + ret = __stop_machine(fn, data, cpus); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.2.3 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single word (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway).
[ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 23 -------- include/linux/cpumask.h | 26 ++++++++- kernel/cpu.c | 128 +++++++---------------------------------- 3 files changed, 43 insertions(+), 134 deletions(-) (limited to 'kernel/cpu.c') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1cd53dfcd309..76e305e064f9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,26 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -/* - * Replace static cpumask_of_cpu_map in the initdata section, - * with one that's allocated sized by the possible number of cpus. 
- * - * (requires nr_cpu_ids to be initialized) - */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -199,9 +179,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 8fa3b6d4a320..96d0509fb8d8 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -265,10 +265,30 @@ static inline void __cpus_shift_left(cpumask_t *dstp, bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } +/* + * Special-case data structure for "single bit set only" constant CPU masks. + * + * We pre-generate all the 64 (or 32) possible bit positions, with enough + * padding to the left and the right, and return the constant pointer + * appropriately offset. 
+ */ +extern const unsigned long + cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; + +static inline const cpumask_t *get_cpu_mask(unsigned int cpu) +{ + const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; + p -= cpu / BITS_PER_LONG; + return (const cpumask_t *)p; +} + +/* + * In cases where we take the address of the cpumask immediately, + * gcc optimizes it out (it's a constant) and there's no huge stack + * variable created: + */ +#define cpumask_of_cpu(cpu) ({ *get_cpu_mask(cpu); }) -/* cpumask_of_cpu_map[] is in kernel/cpu.c */ -extern const cpumask_t *cpumask_of_cpu_map; -#define cpumask_of_cpu(cpu) (cpumask_of_cpu_map[cpu]) #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) diff --git a/kernel/cpu.c b/kernel/cpu.c index a35d8995dc8c..06a8358bb418 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,115 +462,27 @@ out: #endif /* CONFIG_SMP */ -/* 64 bits of zeros, for initializers. */ -#if BITS_PER_LONG == 32 -#define Z64 0, 0 -#else -#define Z64 0 -#endif +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1< 4 - CMI0(4), CMI0(5), CMI0(6), CMI0(7), -#endif -#if NR_CPUS > 8 - CMI0(8), CMI0(9), CMI0(10), CMI0(11), - CMI0(12), CMI0(13), CMI0(14), CMI0(15), -#endif -#if NR_CPUS > 16 - CMI0(16), CMI0(17), CMI0(18), CMI0(19), - CMI0(20), CMI0(21), CMI0(22), CMI0(23), - CMI0(24), CMI0(25), CMI0(26), CMI0(27), - CMI0(28), CMI0(29), CMI0(30), CMI0(31), -#endif -#if NR_CPUS > 32 -#if BITS_PER_LONG == 32 - CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0), - CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0), - CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0), - CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0), - CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0), - CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0), - CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0), - CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0), -#else - CMI0(32), CMI0(33), CMI0(34), CMI0(35), - CMI0(36), CMI0(37), 
CMI0(38), CMI0(39), - CMI0(40), CMI0(41), CMI0(42), CMI0(43), - CMI0(44), CMI0(45), CMI0(46), CMI0(47), - CMI0(48), CMI0(49), CMI0(50), CMI0(51), - CMI0(52), CMI0(53), CMI0(54), CMI0(55), - CMI0(56), CMI0(57), CMI0(58), CMI0(59), - CMI0(60), CMI0(61), CMI0(62), CMI0(63), -#endif /* BITS_PER_LONG == 64 */ -#endif -#if NR_CPUS > 64 - CMI64(64, Z64), -#endif -#if NR_CPUS > 128 - CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64), -#endif -#if NR_CPUS > 256 - CMI256(256, Z256), -#endif -#if NR_CPUS > 512 - CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256), -#endif -#if NR_CPUS > 1024 - CMI1024(1024, Z1024), -#endif -#if NR_CPUS > 2048 - CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024), -#endif -#if NR_CPUS > 4096 -#error NR_CPUS too big. Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), #endif }; - -const cpumask_t *cpumask_of_cpu_map = cpumask_map; - -EXPORT_SYMBOL_GPL(cpumask_of_cpu_map); +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); -- cgit v1.2.3