-rw-r--r-- | arch/ia64/kvm/Kconfig | 1
-rw-r--r-- | arch/powerpc/kvm/Kconfig | 1
-rw-r--r-- | arch/s390/kvm/Kconfig | 1
-rw-r--r-- | arch/x86/kernel/smpboot.c | 4
-rw-r--r-- | arch/x86/kvm/Kconfig | 1
-rw-r--r-- | drivers/acpi/osl.c | 41
-rw-r--r-- | include/linux/kvm_host.h | 4
-rw-r--r-- | include/linux/preempt.h | 43
-rw-r--r-- | include/linux/sched.h | 59
-rw-r--r-- | include/linux/stop_machine.h | 6
-rw-r--r-- | include/linux/workqueue.h | 96
-rw-r--r-- | init/Kconfig | 3
-rw-r--r-- | init/main.c | 2
-rw-r--r-- | kernel/sched.c | 334
-rw-r--r-- | kernel/stop_machine.c | 151
-rw-r--r-- | kernel/trace/Kconfig | 4
-rw-r--r-- | kernel/workqueue.c | 412
-rw-r--r-- | lib/Kconfig.debug | 8
-rw-r--r-- | virt/kvm/kvm_main.c | 28 |
19 files changed, 795 insertions, 404 deletions
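The first group of hunks below (the KVM Kconfig files, include/linux/preempt.h, include/linux/sched.h, kernel/sched.c and virt/kvm/kvm_main.c) replaces the optional, KVM-selected preempt notifiers with an always-built sched_notifier interface that also gains wakeup and sleep callbacks. A minimal sketch of a client of the new interface, modelled on the KVM conversion; struct my_ctx and the callback bodies are hypothetical, only the sched_notifier declarations come from this patch:

#include <linux/kernel.h>
#include <linux/sched.h>

struct my_ctx {
        struct sched_notifier notifier;
        unsigned long nr_in;    /* stand-in for per-task state */
        unsigned long nr_out;
};

/* we are running on a CPU again; @prev is the task that ran before us */
static void my_sched_in(struct sched_notifier *sn, struct task_struct *prev)
{
        struct my_ctx *ctx = container_of(sn, struct my_ctx, notifier);

        /* reload CPU-local state for ctx here; this sketch just counts */
        ctx->nr_in++;
}

/* we are being scheduled out in favour of @next */
static void my_sched_out(struct sched_notifier *sn, struct task_struct *next)
{
        struct my_ctx *ctx = container_of(sn, struct my_ctx, notifier);

        /* save CPU-local state for ctx here */
        ctx->nr_out++;
}

static struct sched_notifier_ops my_notifier_ops = {
        .in  = my_sched_in,
        .out = my_sched_out,
        /* .wakeup and .sleep may stay NULL; fire_sched_notifier() skips them */
};

/* attach to the current task; must be undone before the task exits */
static void my_ctx_attach(struct my_ctx *ctx)
{
        sched_notifier_init(&ctx->notifier, &my_notifier_ops);
        sched_notifier_register(&ctx->notifier);
}

static void my_ctx_detach(struct my_ctx *ctx)
{
        sched_notifier_unregister(&ctx->notifier);
}

As in the vcpu_load()/vcpu_put() hunks, KVM registers the notifier and does the initial state load with preemption disabled (get_cpu()/put_cpu()); the .in callback only fires on subsequent context switches.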
diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig index ef3e7be29caf..a9e2b9c60d65 100644 --- a/arch/ia64/kvm/Kconfig +++ b/arch/ia64/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM depends on HAVE_KVM && MODULES && EXPERIMENTAL # for device assignment: depends on PCI - select PREEMPT_NOTIFIERS select ANON_INODES select HAVE_KVM_IRQCHIP select KVM_APIC_ARCHITECTURE diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 07703f72330e..38818c08a33c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -18,7 +18,6 @@ if VIRTUALIZATION config KVM bool - select PREEMPT_NOTIFIERS select ANON_INODES config KVM_BOOK3S_64_HANDLER diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 6ee55ae84ce2..f9b46b07a0a1 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -18,7 +18,6 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM && EXPERIMENTAL - select PREEMPT_NOTIFIERS select ANON_INODES ---help--- Support hosting paravirtualized guest machines using the SIE diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 565ebc65920e..ba43dfed353d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -687,7 +687,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK(&c_idle.work, do_fork_idle); + INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -713,6 +713,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); + destroy_work_on_stack(&c_idle.work); return PTR_ERR(c_idle.idle); } @@ -831,6 +832,7 @@ do_rest: smpboot_restore_warm_reset_vector(); } + destroy_work_on_stack(&c_idle.work); return boot_error; } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3c4d0109ad20..f5a136a65638 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM depends on HAVE_KVM # for device assignment: depends on PCI - select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES select HAVE_KVM_IRQCHIP diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 02e8464e480f..93f664733d3f 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -191,36 +191,11 @@ acpi_status __init acpi_os_initialize(void) return AE_OK; } -static void bind_to_cpu0(struct work_struct *work) -{ - set_cpus_allowed_ptr(current, cpumask_of(0)); - kfree(work); -} - -static void bind_workqueue(struct workqueue_struct *wq) -{ - struct work_struct *work; - - work = kzalloc(sizeof(struct work_struct), GFP_KERNEL); - INIT_WORK(work, bind_to_cpu0); - queue_work(wq, work); -} - acpi_status acpi_os_initialize1(void) { - /* - * On some machines, a software-initiated SMI causes corruption unless - * the SMI runs on CPU 0. An SMI can be initiated by any AML, but - * typically it's done in GPE-related methods that are run via - * workqueues, so we can avoid the known corruption cases by binding - * the workqueues to CPU 0. 
- */ - kacpid_wq = create_singlethread_workqueue("kacpid"); - bind_workqueue(kacpid_wq); - kacpi_notify_wq = create_singlethread_workqueue("kacpi_notify"); - bind_workqueue(kacpi_notify_wq); - kacpi_hotplug_wq = create_singlethread_workqueue("kacpi_hotplug"); - bind_workqueue(kacpi_hotplug_wq); + kacpid_wq = create_workqueue("kacpid"); + kacpi_notify_wq = create_workqueue("kacpi_notify"); + kacpi_hotplug_wq = create_workqueue("kacpi_hotplug"); BUG_ON(!kacpid_wq); BUG_ON(!kacpi_notify_wq); BUG_ON(!kacpi_hotplug_wq); @@ -759,7 +734,15 @@ static acpi_status __acpi_os_execute(acpi_execute_type type, (type == OSL_NOTIFY_HANDLER ? kacpi_notify_wq : kacpid_wq); dpc->wait = hp ? 1 : 0; INIT_WORK(&dpc->work, acpi_os_execute_deferred); - ret = queue_work(queue, &dpc->work); + + /* + * On some machines, a software-initiated SMI causes corruption unless + * the SMI runs on CPU 0. An SMI can be initiated by any AML, but + * typically it's done in GPE-related methods that are run via + * workqueues, so we can avoid the known corruption cases by always + * queueing on CPU 0. + */ + ret = queue_work_on(0, queue, &dpc->work); if (!ret) { printk(KERN_ERR PREFIX diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bd5a616d9373..45b631e9dc3b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -74,9 +74,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus, struct kvm_vcpu { struct kvm *kvm; -#ifdef CONFIG_PREEMPT_NOTIFIERS - struct preempt_notifier preempt_notifier; -#endif + struct sched_notifier sched_notifier; int vcpu_id; struct mutex mutex; int cpu; diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 72b1a10a59b6..538c675b979d 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -93,47 +93,4 @@ do { \ #endif -#ifdef CONFIG_PREEMPT_NOTIFIERS - -struct preempt_notifier; - -/** - * preempt_ops - notifiers called when a task is preempted and rescheduled - * @sched_in: we're about to be rescheduled: - * notifier: struct preempt_notifier for the task being scheduled - * cpu: cpu we're scheduled on - * @sched_out: we've just been preempted - * notifier: struct preempt_notifier for the task being preempted - * next: the task that's kicking us out - */ -struct preempt_ops { - void (*sched_in)(struct preempt_notifier *notifier, int cpu); - void (*sched_out)(struct preempt_notifier *notifier, - struct task_struct *next); -}; - -/** - * preempt_notifier - key for installing preemption notifiers - * @link: internal use - * @ops: defines the notifier functions to be called - * - * Usually used in conjunction with container_of(). 
- */ -struct preempt_notifier { - struct hlist_node link; - struct preempt_ops *ops; -}; - -void preempt_notifier_register(struct preempt_notifier *notifier); -void preempt_notifier_unregister(struct preempt_notifier *notifier); - -static inline void preempt_notifier_init(struct preempt_notifier *notifier, - struct preempt_ops *ops) -{ - INIT_HLIST_NODE(¬ifier->link); - notifier->ops = ops; -} - -#endif - #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1cddd4230868..e3c45ab0ad04 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1214,6 +1214,50 @@ struct sched_rt_entity { #endif }; +struct sched_notifier; + +/** + * sched_notifier_ops - notifiers called for scheduling events + * @wakeup: we're waking up + * notifier: struct sched_notifier for the task being woken up + * @sleep: we're going to bed + * notifier: struct sched_notifier for the task sleeping + * @in: we're now running on the cpu + * notifier: struct sched_notifier for the task being scheduled in + * prev: the task which ran before us + * @out: we're leaving the cpu + * notifier: struct sched_notifier for the task being scheduled out + * next: the task which will run after us + */ +struct sched_notifier_ops { + void (*wakeup)(struct sched_notifier *notifier); + void (*sleep)(struct sched_notifier *notifier); + void (*in)(struct sched_notifier *notifier, struct task_struct *prev); + void (*out)(struct sched_notifier *notifier, struct task_struct *next); +}; + +/** + * sched_notifier - key for installing scheduler notifiers + * @link: internal use + * @ops: defines the notifier functions to be called + * + * Usually used in conjunction with container_of(). + */ +struct sched_notifier { + struct hlist_node link; + struct sched_notifier_ops *ops; +}; + +void sched_notifier_register(struct sched_notifier *notifier); +void sched_notifier_unregister(struct sched_notifier *notifier); + +static inline void sched_notifier_init(struct sched_notifier *notifier, + struct sched_notifier_ops *ops) +{ + INIT_HLIST_NODE(¬ifier->link); + notifier->ops = ops; +} + struct rcu_node; struct task_struct { @@ -1237,10 +1281,8 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; -#ifdef CONFIG_PREEMPT_NOTIFIERS - /* list of struct preempt_notifier: */ - struct hlist_head preempt_notifiers; -#endif + /* list of struct sched_notifier: */ + struct hlist_head sched_notifiers; /* * fpu_counter contains the number of consecutive context switches @@ -1811,6 +1853,8 @@ static inline void rcu_copy_process(struct task_struct *p) #ifdef CONFIG_SMP extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); +extern int force_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask); #else static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) @@ -1819,6 +1863,11 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, return -EINVAL; return 0; } +static inline int force_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask) +{ + return set_cpus_allowed_ptr(p, new_mask); +} #endif #ifndef CONFIG_CPUMASK_OFFSTACK @@ -2017,6 +2066,8 @@ extern void release_uids(struct user_namespace *ns); extern void do_timer(unsigned long ticks); +extern bool try_to_wake_up_local(struct task_struct *p, unsigned int state, + int wake_flags); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct 
task_struct *tsk, diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index baba3a23a814..2d32e061bdc4 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -53,6 +53,11 @@ int stop_machine_create(void); */ void stop_machine_destroy(void); +/** + * init_stop_machine: initialize stop_machine during boot + */ +void init_stop_machine(void); + #else static inline int stop_machine(int (*fn)(void *), void *data, @@ -67,6 +72,7 @@ static inline int stop_machine(int (*fn)(void *), void *data, static inline int stop_machine_create(void) { return 0; } static inline void stop_machine_destroy(void) { } +static inline void init_stop_machine(void) { } #endif /* CONFIG_SMP */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index cf24c20de9e4..8e689d1f91d8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -22,11 +22,26 @@ typedef void (*work_func_t)(struct work_struct *work); */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) +enum { + WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ + WORK_STRUCT_STATIC_BIT = 1, /* static initializer (debugobjects) */ + + WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, + + /* + * Reserve 3bits off of cwq pointer. This is enough and + * provides acceptable alignment on both 32 and 64bit + * machines. + */ + WORK_STRUCT_FLAG_BITS = 3, + + WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, + WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, +}; + struct work_struct { atomic_long_t data; -#define WORK_STRUCT_PENDING 0 /* T if work item pending execution */ -#define WORK_STRUCT_FLAG_MASK (3UL) -#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) struct list_head entry; work_func_t func; #ifdef CONFIG_LOCKDEP @@ -35,6 +50,7 @@ struct work_struct { }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT(0) +#define WORK_DATA_STATIC_INIT() ATOMIC_LONG_INIT(WORK_STRUCT_STATIC) struct delayed_work { struct work_struct work; @@ -63,7 +79,7 @@ struct execute_work { #endif #define __WORK_INITIALIZER(n, f) { \ - .data = WORK_DATA_INIT(), \ + .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ @@ -91,6 +107,19 @@ struct execute_work { #define PREPARE_DELAYED_WORK(_work, _func) \ PREPARE_WORK(&(_work)->work, (_func)) +#ifdef CONFIG_DEBUG_OBJECTS_WORK +extern void __init_work(struct work_struct *work, int onstack); +extern void destroy_work_on_stack(struct work_struct *work); +static inline bool work_static(struct work_struct *work) +{ + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); +} +#else +static inline void __init_work(struct work_struct *work, int onstack) { } +static inline void destroy_work_on_stack(struct work_struct *work) { } +static inline bool work_static(struct work_struct *work) { return false; } +#endif + /* * initialize all of a work item in one go * @@ -99,24 +128,36 @@ struct execute_work { * to generate better code. 
*/ #ifdef CONFIG_LOCKDEP -#define INIT_WORK(_work, _func) \ +#define __INIT_WORK(_work, _func, _onstack) \ do { \ static struct lock_class_key __key; \ \ + __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0);\ INIT_LIST_HEAD(&(_work)->entry); \ PREPARE_WORK((_work), (_func)); \ } while (0) #else -#define INIT_WORK(_work, _func) \ +#define __INIT_WORK(_work, _func, _onstack) \ do { \ + __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ PREPARE_WORK((_work), (_func)); \ } while (0) #endif +#define INIT_WORK(_work, _func) \ + do { \ + __INIT_WORK((_work), (_func), 0); \ + } while (0) + +#define INIT_WORK_ON_STACK(_work, _func) \ + do { \ + __INIT_WORK((_work), (_func), 1); \ + } while (0) + #define INIT_DELAYED_WORK(_work, _func) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ @@ -125,28 +166,22 @@ struct execute_work { #define INIT_DELAYED_WORK_ON_STACK(_work, _func) \ do { \ - INIT_WORK(&(_work)->work, (_func)); \ + INIT_WORK_ON_STACK(&(_work)->work, (_func)); \ init_timer_on_stack(&(_work)->timer); \ } while (0) -#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ +#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ init_timer_deferrable(&(_work)->timer); \ } while (0) -#define INIT_DELAYED_WORK_ON_STACK(_work, _func) \ - do { \ - INIT_WORK(&(_work)->work, (_func)); \ - init_timer_on_stack(&(_work)->timer); \ - } while (0) - /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ - test_bit(WORK_STRUCT_PENDING, work_data_bits(work)) + test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently @@ -161,16 +196,19 @@ struct execute_work { * @work: The work item in question */ #define work_clear_pending(work) \ - clear_bit(WORK_STRUCT_PENDING, work_data_bits(work)) + clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) +enum { + WQ_FREEZEABLE = 1 << 0, /* freeze during suspend */ + WQ_SINGLE_THREAD = 1 << 1, /* no per-cpu worker */ +}; extern struct workqueue_struct * -__create_workqueue_key(const char *name, int singlethread, - int freezeable, int rt, struct lock_class_key *key, - const char *lock_name); +__create_workqueue_key(const char *name, unsigned int flags, + struct lock_class_key *key, const char *lock_name); #ifdef CONFIG_LOCKDEP -#define __create_workqueue(name, singlethread, freezeable, rt) \ +#define __create_workqueue(name, flags) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ @@ -180,20 +218,20 @@ __create_workqueue_key(const char *name, int singlethread, else \ __lock_name = #name; \ \ - __create_workqueue_key((name), (singlethread), \ - (freezeable), (rt), &__key, \ + __create_workqueue_key((name), (flags), &__key, \ __lock_name); \ }) #else -#define __create_workqueue(name, singlethread, freezeable, rt) \ - __create_workqueue_key((name), (singlethread), (freezeable), (rt), \ - NULL, NULL) +#define __create_workqueue(name, flags) \ + __create_workqueue_key((name), (flags), NULL, NULL) #endif -#define create_workqueue(name) __create_workqueue((name), 0, 0, 0) -#define create_rt_workqueue(name) __create_workqueue((name), 0, 0, 1) -#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1, 0) -#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0, 0) 
+#define create_workqueue(name) \ + __create_workqueue((name), 0) +#define create_freezeable_workqueue(name) \ + __create_workqueue((name), WQ_FREEZEABLE | WQ_SINGLE_THREAD) +#define create_singlethread_workqueue(name) \ + __create_workqueue((name), WQ_SINGLE_THREAD) extern void destroy_workqueue(struct workqueue_struct *wq); diff --git a/init/Kconfig b/init/Kconfig index 94a8e10d6290..1b532005aede 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1235,7 +1235,4 @@ config STOP_MACHINE source "block/Kconfig" -config PREEMPT_NOTIFIERS - bool - source "kernel/Kconfig.locks" diff --git a/init/main.c b/init/main.c index 698066ff8762..43807f7dcf5f 100644 --- a/init/main.c +++ b/init/main.c @@ -34,6 +34,7 @@ #include <linux/security.h> #include <linux/smp.h> #include <linux/workqueue.h> +#include <linux/stop_machine.h> #include <linux/profile.h> #include <linux/rcupdate.h> #include <linux/moduleparam.h> @@ -779,6 +780,7 @@ static void __init do_initcalls(void) static void __init do_basic_setup(void) { init_workqueues(); + init_stop_machine(); cpuset_init_smp(); usermodehelper_init(); init_tmpfs(); diff --git a/kernel/sched.c b/kernel/sched.c index dba459d08916..be7592902610 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1389,6 +1389,16 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; +#define fire_sched_notifier(p, callback, args...) do { \ + struct task_struct *__p = (p); \ + struct sched_notifier *__sn; \ + struct hlist_node *__pos; \ + \ + hlist_for_each_entry(__sn, __pos, &__p->sched_notifiers, link) \ + if (__sn->ops->callback) \ + __sn->ops->callback(__sn , ##args); \ +} while (0) + static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); /* @@ -2097,6 +2107,7 @@ struct migration_req { struct task_struct *task; int dest_cpu; + bool force; struct completion done; }; @@ -2105,8 +2116,8 @@ struct migration_req { * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +static int migrate_task(struct task_struct *p, int dest_cpu, + struct migration_req *req, bool force) { struct rq *rq = task_rq(p); @@ -2123,6 +2134,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) init_completion(&req->done); req->task = p; req->dest_cpu = dest_cpu; + req->force = force; list_add(&req->list, &rq->migration_queue); return 1; @@ -2323,11 +2335,73 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local) +{ + schedstat_inc(p, se.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.nr_wakeups_local); + else + schedstat_inc(p, se.nr_wakeups_remote); + + activate_task(rq, p, 1); + + /* + * Only attribute actual wakeups done by this task. 
+ */ + if (!in_interrupt()) { + struct sched_entity *se = ¤t->se; + u64 sample = se->sum_exec_runtime; + + if (se->last_wakeup) + sample -= se->last_wakeup; + else + sample -= se->start_runtime; + update_avg(&se->avg_wakeup, sample); + + se->last_wakeup = se->sum_exec_runtime; + } +} + +static inline void ttwu_woken_up(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(rq, p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_wake_up) + p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* + * Wake up is complete, fire wake up notifier. This allows + * try_to_wake_up_local() to be called from wake up notifiers. + */ + if (success) + fire_sched_notifier(p, wakeup); +} + +/** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2335,7 +2409,8 @@ void task_oncpu_function_call(struct task_struct *p, * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2406,57 +2481,61 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.nr_wakeups_local); - else - schedstat_inc(p, se.nr_wakeups_remote); - activate_task(rq, p, 1); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu); success = 1; +out_running: + ttwu_woken_up(p, rq, wake_flags, success); +out: + task_rq_unlock(rq, &flags); + put_cpu(); - /* - * Only attribute actual wakeups done by this task. - */ - if (!in_interrupt()) { - struct sched_entity *se = ¤t->se; - u64 sample = se->sum_exec_runtime; - - if (se->last_wakeup) - sample -= se->last_wakeup; - else - sample -= se->start_runtime; - update_avg(&se->avg_wakeup, sample); + return success; +} - se->last_wakeup = se->sum_exec_runtime; - } +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and @p is + * not the current task. this_rq() stays locked over invocation. + * + * This function can be called from wakeup and sleep scheduler + * notifiers. Be careful not to create deep recursion by chaining + * wakeup notifiers. + * + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. 
+ */ +bool try_to_wake_up_local(struct task_struct *p, unsigned int state, + int wake_flags) +{ + struct rq *rq = task_rq(p); + bool success = false; -out_running: - trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, wake_flags); + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (!sched_feat(SYNC_WAKEUPS)) + wake_flags &= ~WF_SYNC; - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + if (!(p->state & state)) + return false; - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, wake_flags & WF_SYNC, false, true); + success = true; } -#endif -out: - task_rq_unlock(rq, &flags); - put_cpu(); + + ttwu_woken_up(p, rq, wake_flags, success); return success; } @@ -2538,10 +2617,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; INIT_LIST_HEAD(&p->se.group_node); - -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif + INIT_HLIST_HEAD(&p->sched_notifiers); /* * We mark the process as running here, but have not actually @@ -2651,63 +2727,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) task_rq_unlock(rq, &flags); } -#ifdef CONFIG_PREEMPT_NOTIFIERS - /** - * preempt_notifier_register - tell me when current is being preempted & rescheduled + * sched_notifier_register - register scheduler notifier * @notifier: notifier struct to register */ -void preempt_notifier_register(struct preempt_notifier *notifier) +void sched_notifier_register(struct sched_notifier *notifier) { - hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); + hlist_add_head(¬ifier->link, ¤t->sched_notifiers); } -EXPORT_SYMBOL_GPL(preempt_notifier_register); +EXPORT_SYMBOL_GPL(sched_notifier_register); /** - * preempt_notifier_unregister - no longer interested in preemption notifications + * sched_notifier_unregister - unregister scheduler notifier * @notifier: notifier struct to unregister * - * This is safe to call from within a preemption notifier. + * This is safe to call from within a scheduler notifier. 
*/ -void preempt_notifier_unregister(struct preempt_notifier *notifier) +void sched_notifier_unregister(struct sched_notifier *notifier) { hlist_del(¬ifier->link); } -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ - struct preempt_notifier *notifier; - struct hlist_node *node; - - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); -} - -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) -{ -} - -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +EXPORT_SYMBOL_GPL(sched_notifier_unregister); /** * prepare_task_switch - prepare to switch tasks @@ -2726,7 +2766,7 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - fire_sched_out_preempt_notifiers(prev, next); + fire_sched_notifier(current, out, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -2768,7 +2808,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); perf_event_task_sched_in(current, cpu_of(rq)); - fire_sched_in_preempt_notifiers(current); + fire_sched_notifier(current, in, prev); finish_lock_switch(rq, prev); if (mm) @@ -3133,7 +3173,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) goto out; /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { + if (migrate_task(p, dest_cpu, &req, false)) { /* Need to wait for migration thread (might exit: take ref). */ struct task_struct *mt = rq->migration_thread; @@ -5446,10 +5486,17 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * Fire sleep notifier before changing any scheduler + * state. This allows try_to_wake_up_local() to be + * called from sleep notifiers. + */ + fire_sched_notifier(prev, sleep); deactivate_task(rq, prev, 1); + } switch_count = &prev->nvcsw; } @@ -7039,34 +7086,19 @@ static inline void sched_init_granularity(void) * 7) we wake up and the migration is done. */ -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. 
- */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +static inline int __set_cpus_allowed(struct task_struct *p, + const struct cpumask *new_mask, + struct rq *rq, unsigned long *flags, + bool force) { struct migration_req req; - unsigned long flags; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); if (!cpumask_intersects(new_mask, cpu_online_mask)) { ret = -EINVAL; goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && - !cpumask_equal(&p->cpus_allowed, new_mask))) { - ret = -EINVAL; - goto out; - } - if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { @@ -7078,12 +7110,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req, + force)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; get_task_struct(mt); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, flags); wake_up_process(rq->migration_thread); put_task_struct(mt); wait_for_completion(&req.done); @@ -7091,13 +7124,54 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) return 0; } out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, flags); return ret; } + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpumask_equal(&p->cpus_allowed, new_mask))) { + task_rq_unlock(rq, &flags); + return -EINVAL; + } + + return __set_cpus_allowed(p, new_mask, rq, &flags, false); +} EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* + * Similar to set_cpus_allowed_ptr() but bypasses PF_THREAD_BOUND + * check and ignores cpu_active() status as long as the cpu is online. + * The caller is responsible for guaranteeing that the destination + * cpus don't go down until this function finishes and in general + * ensuring things don't go bonkers. + */ +int force_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(p, &flags); + return __set_cpus_allowed(p, new_mask, rq, &flags, true); +} + +/* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're @@ -7108,12 +7182,13 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); * * Returns non-zero if task was successfully migrated. 
*/ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu, + bool force) { struct rq *rq_dest, *rq_src; int ret = 0, on_rq; - if (unlikely(!cpu_active(dest_cpu))) + if (!force && unlikely(!cpu_active(dest_cpu))) return ret; rq_src = cpu_rq(src_cpu); @@ -7192,7 +7267,8 @@ static int migration_thread(void *data) if (req->task != NULL) { spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); + __migrate_task(req->task, cpu, req->dest_cpu, + req->force); } else if (likely(cpu == (badcpu = smp_processor_id()))) { req->dest_cpu = RCU_MIGRATION_GOT_QS; spin_unlock(&rq->lock); @@ -7217,7 +7293,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) int ret; local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); + ret = __migrate_task(p, src_cpu, dest_cpu, false); local_irq_enable(); return ret; } @@ -9569,9 +9645,7 @@ void __init sched_init(void) set_load_weight(&init_task); -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&init_task.preempt_notifiers); -#endif + INIT_HLIST_HEAD(&init_task.sched_notifiers); #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11b..671a4ac00fba 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -25,6 +25,8 @@ enum stopmachine_state { STOPMACHINE_RUN, /* Exit */ STOPMACHINE_EXIT, + /* Done */ + STOPMACHINE_DONE, }; static enum stopmachine_state state; @@ -42,10 +44,9 @@ static DEFINE_MUTEX(lock); static DEFINE_MUTEX(setup_lock); /* Users of stop_machine. */ static int refcount; -static struct workqueue_struct *stop_machine_wq; +static struct task_struct **stop_machine_threads; static struct stop_machine_data active, idle; static const struct cpumask *active_cpus; -static void *stop_machine_work; static void set_state(enum stopmachine_state newstate) { @@ -63,14 +64,31 @@ static void ack_state(void) } /* This is the actual function which stops the CPU. It runs - * in the context of a dedicated stopmachine workqueue. */ -static void stop_cpu(struct work_struct *unused) + * on dedicated per-cpu kthreads. */ +static int stop_cpu(void *unused) { enum stopmachine_state curstate = STOPMACHINE_NONE; - struct stop_machine_data *smdata = &idle; + struct stop_machine_data *smdata; int cpu = smp_processor_id(); int err; +repeat: + /* Wait for __stop_machine() to initiate */ + while (true) { + set_current_state(TASK_INTERRUPTIBLE); + /* <- kthread_stop() and __stop_machine()::smp_wmb() */ + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + if (state == STOPMACHINE_PREPARE) + break; + schedule(); + } + smp_rmb(); /* <- __stop_machine()::set_state() */ + + /* Okay, let's go */ + smdata = &idle; if (!active_cpus) { if (cpu == cpumask_first(cpu_online_mask)) smdata = &active; @@ -104,6 +122,7 @@ static void stop_cpu(struct work_struct *unused) } while (curstate != STOPMACHINE_EXIT); local_irq_enable(); + goto repeat; } /* Callback for CPUs which aren't supposed to do anything. 
*/ @@ -112,46 +131,122 @@ static int chill(void *unused) return 0; } +static int create_stop_machine_thread(unsigned int cpu) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct task_struct **pp = per_cpu_ptr(stop_machine_threads, cpu); + struct task_struct *p; + + if (*pp) + return -EBUSY; + + p = kthread_create(stop_cpu, NULL, "kstop/%u", cpu); + if (IS_ERR(p)) + return PTR_ERR(p); + + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + *pp = p; + return 0; +} + +/* Should be called with cpu hotplug disabled and setup_lock held */ +static void kill_stop_machine_threads(void) +{ + unsigned int cpu; + + if (!stop_machine_threads) + return; + + for_each_online_cpu(cpu) { + struct task_struct *p = *per_cpu_ptr(stop_machine_threads, cpu); + if (p) + kthread_stop(p); + } + free_percpu(stop_machine_threads); + stop_machine_threads = NULL; +} + int stop_machine_create(void) { + unsigned int cpu; + + get_online_cpus(); mutex_lock(&setup_lock); if (refcount) goto done; - stop_machine_wq = create_rt_workqueue("kstop"); - if (!stop_machine_wq) - goto err_out; - stop_machine_work = alloc_percpu(struct work_struct); - if (!stop_machine_work) + + stop_machine_threads = alloc_percpu(struct task_struct *); + if (!stop_machine_threads) goto err_out; + + /* + * cpu hotplug is disabled, create only for online cpus, + * cpu_callback() will handle cpu hot [un]plugs. + */ + for_each_online_cpu(cpu) { + if (create_stop_machine_thread(cpu)) + goto err_out; + kthread_bind(*per_cpu_ptr(stop_machine_threads, cpu), cpu); + } done: refcount++; mutex_unlock(&setup_lock); + put_online_cpus(); return 0; err_out: - if (stop_machine_wq) - destroy_workqueue(stop_machine_wq); + kill_stop_machine_threads(); mutex_unlock(&setup_lock); + put_online_cpus(); return -ENOMEM; } EXPORT_SYMBOL_GPL(stop_machine_create); void stop_machine_destroy(void) { + get_online_cpus(); mutex_lock(&setup_lock); - refcount--; - if (refcount) - goto done; - destroy_workqueue(stop_machine_wq); - free_percpu(stop_machine_work); -done: + if (!--refcount) + kill_stop_machine_threads(); mutex_unlock(&setup_lock); + put_online_cpus(); } EXPORT_SYMBOL_GPL(stop_machine_destroy); +static int __cpuinit stop_machine_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct task_struct **pp = per_cpu_ptr(stop_machine_threads, cpu); + + /* Hotplug exclusion is enough, no need to worry about setup_lock */ + if (!stop_machine_threads) + return NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (create_stop_machine_thread(cpu)) { + printk(KERN_ERR "failed to create stop machine " + "thread for %u\n", cpu); + return NOTIFY_BAD; + } + break; + + case CPU_ONLINE: + kthread_bind(*pp, cpu); + break; + + case CPU_UP_CANCELED: + case CPU_POST_DEAD: + kthread_stop(*pp); + *pp = NULL; + break; + } + return NOTIFY_OK; +} + int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) { - struct work_struct *sm_work; int i, ret; /* Set up initial state. */ @@ -164,19 +259,18 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) idle.fn = chill; idle.data = NULL; - set_state(STOPMACHINE_PREPARE); + set_state(STOPMACHINE_PREPARE); /* -> stop_cpu()::smp_rmb() */ + smp_wmb(); /* -> stop_cpu()::set_current_state() */ /* Schedule the stop_cpu work on all cpus: hold this CPU so one * doesn't hit this CPU until we're ready. 
*/ get_cpu(); - for_each_online_cpu(i) { - sm_work = per_cpu_ptr(stop_machine_work, i); - INIT_WORK(sm_work, stop_cpu); - queue_work_on(i, stop_machine_wq, sm_work); - } + for_each_online_cpu(i) + wake_up_process(*per_cpu_ptr(stop_machine_threads, i)); /* This will release the thread on our CPU. */ put_cpu(); - flush_workqueue(stop_machine_wq); + while (state < STOPMACHINE_DONE) + yield(); ret = active.fnret; mutex_unlock(&lock); return ret; @@ -197,3 +291,8 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) return ret; } EXPORT_SYMBOL_GPL(stop_machine); + +void __init init_stop_machine(void) +{ + hotcpu_notifier(stop_machine_cpu_callback, 0); +} diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d006554888dc..de0bfeb5e95f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -414,7 +414,9 @@ config KMEMTRACE If unsure, say N. config WORKQUEUE_TRACER - bool "Trace workqueues" +# Temporarily disabled during workqueue reimplementation +# bool "Trace workqueues" + def_bool n select GENERIC_TRACER help The workqueue tracer provides some statistical informations diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 67e526b6ae81..5392939cb1e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,12 +33,22 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> -#define CREATE_TRACE_POINTS -#include <trace/events/workqueue.h> + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Set during initialization and read-only afterwards. + * + * L: cwq->lock protected. Access with cwq->lock held. + * + * W: workqueue_lock protected. + */ /* * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * possible cpu). The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. 
*/ struct cpu_workqueue_struct { @@ -48,26 +58,134 @@ struct cpu_workqueue_struct { wait_queue_head_t more_work; struct work_struct *current_work; - struct workqueue_struct *wq; - struct task_struct *thread; -} ____cacheline_aligned; + struct workqueue_struct *wq; /* I: the owning workqueue */ + struct task_struct *thread; +} __attribute__((aligned(1 << WORK_STRUCT_FLAG_BITS))); /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; + unsigned int flags; /* I: WQ_* flags */ + struct cpu_workqueue_struct *cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; +#ifdef CONFIG_DEBUG_OBJECTS_WORK + +static struct debug_obj_descr work_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int work_fixup_init(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int work_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The work struct was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { + debug_object_init(work, &work_debug_descr); + debug_object_activate(work, &work_debug_descr); + return 0; + } + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int work_fixup_free(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_free(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr work_debug_descr = { + .name = "work_struct", + .fixup_init = work_fixup_init, + .fixup_activate = work_fixup_activate, + .fixup_free = work_fixup_free, +}; + +static inline void debug_work_activate(struct work_struct *work) +{ + debug_object_activate(work, &work_debug_descr); +} + +static inline void debug_work_deactivate(struct work_struct *work) +{ + debug_object_deactivate(work, &work_debug_descr); +} + +void __init_work(struct work_struct *work, int onstack) +{ + if (onstack) + debug_object_init_on_stack(work, &work_debug_descr); + else + debug_object_init(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(__init_work); + +void destroy_work_on_stack(struct work_struct *work) +{ + debug_object_free(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_work_on_stack); + +#else +static inline void debug_work_activate(struct work_struct *work) { } +static inline void debug_work_deactivate(struct work_struct *work) { } +#endif + /* Serializes the accesses to the list of workqueues. 
*/ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); @@ -84,9 +202,9 @@ static const struct cpumask *cpu_singlethread_map __read_mostly; static cpumask_var_t cpu_populated_map __read_mostly; /* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static inline bool is_wq_single_threaded(struct workqueue_struct *wq) { - return wq->singlethread; + return wq->flags & WQ_SINGLE_THREAD; } static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) @@ -95,8 +213,8 @@ static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) ? cpu_singlethread_map : cpu_populated_map; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) { if (unlikely(is_wq_single_threaded(wq))) cpu = singlethread_cpu; @@ -108,45 +226,61 @@ struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) * - Must *only* be called if the pending flag is set */ static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) { - unsigned long new; - BUG_ON(!work_pending(work)); - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); + atomic_long_set(&work->data, (unsigned long)cwq | + (work_static(work) ? WORK_STRUCT_STATIC : 0) | + WORK_STRUCT_PENDING | extra_flags); } -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + return (void *)(atomic_long_read(&work->data) & + WORK_STRUCT_WQ_DATA_MASK); } +/** + * insert_work - insert a work into cwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work into @cwq after @head. + * + * CONTEXT: + * spin_lock_irq(cwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { - trace_workqueue_insertion(cwq->thread, work); + /* we own @work, set data and link */ + set_wq_data(work, cwq, extra_flags); - set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). 
*/ smp_wmb(); + list_add_tail(&work->entry, head); wake_up(&cwq->more_work); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); unsigned long flags; + debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); + BUG_ON(!list_empty(&work->entry)); + insert_work(cwq, work, &cwq->worklist, 0); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -187,9 +321,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -200,9 +333,8 @@ static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -239,14 +371,14 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + set_wq_data(work, get_cwq(raw_smp_processor_id(), wq), 0); timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -261,60 +393,88 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +/** + * process_one_work - process single work + * @cwq: cwq to process work for + * @work: work to process + * + * Process @work. This function contains all the logics necessary to + * process a single work including synchronization against and + * interaction with other workers on the same cpu, queueing and + * flushing. As long as context requirement is met, any worker can + * call this function to process a work. + * + * CONTEXT: + * spin_lock_irq(cwq->lock) which is released and regrabbed. + */ +static void process_one_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work) +{ + work_func_t f = work->func; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct from + * inside the function that is called from it, this we need to + * take into account for lockdep too. To avoid bogus "held + * lock freed" warnings as well as problems when looking into + * work->lockdep_map, make a copy and use that here. 
+ */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + /* claim and process */ + debug_work_deactivate(work); + cwq->current_work = work; + list_del_init(&work->entry); + + spin_unlock_irq(&cwq->lock); + + BUG_ON(get_wq_data(work) != cwq); + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + f(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), task_pid_nr(current)); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + + spin_lock_irq(&cwq->lock); + + /* we're done with it, release */ + cwq->current_work = NULL; +} + static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); while (!list_empty(&cwq->worklist)) { struct work_struct *work = list_entry(cwq->worklist.next, struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. - */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - trace_workqueue_execution(cwq->thread, work); - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); - } - - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; + process_one_work(cwq, work); } spin_unlock_irq(&cwq->lock); } +/** + * worker_thread - the worker thread function + * @__cwq: cwq to serve + * + * The cwq worker thread function. + */ static int worker_thread(void *__cwq) { struct cpu_workqueue_struct *cwq = __cwq; DEFINE_WAIT(wait); - if (cwq->wq->freezeable) + if (cwq->wq->flags & WQ_FREEZEABLE) set_freezable(); for (;;) { @@ -347,15 +507,32 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @head: insertion point + * + * Insert barrier @barr into @cwq before @head. + * + * CONTEXT: + * spin_lock_irq(cwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, struct list_head *head) { - INIT_WORK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - + /* + * debugobject calls are safe here even with cwq->lock locked + * as we know for sure that this will not trigger any of the + * checks and call back into the fixup functions where we + * might deadlock. 
+ */ + INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); - insert_work(cwq, &barr->work, head); + debug_work_activate(&barr->work); + insert_work(cwq, &barr->work, head, 0); } static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) @@ -372,8 +549,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } spin_unlock_irq(&cwq->lock); - if (active) + if (active) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } return active; } @@ -387,9 +566,6 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. */ void flush_workqueue(struct workqueue_struct *wq) { @@ -428,7 +604,6 @@ int flush_work(struct work_struct *work) lock_map_acquire(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); - prev = NULL; spin_lock_irq(&cwq->lock); if (!list_empty(&work->entry)) { /* @@ -437,21 +612,22 @@ int flush_work(struct work_struct *work) */ smp_rmb(); if (unlikely(cwq != get_wq_data(work))) - goto out; + goto already_gone; prev = &work->entry; } else { if (cwq->current_work != work) - goto out; + goto already_gone; prev = &cwq->worklist; } insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + spin_unlock_irq(&cwq->lock); wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&cwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -464,7 +640,7 @@ static int try_to_grab_pending(struct work_struct *work) struct cpu_workqueue_struct *cwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* @@ -485,6 +661,7 @@ static int try_to_grab_pending(struct work_struct *work) */ smp_rmb(); if (cwq == get_wq_data(work)) { + debug_work_deactivate(work); list_del_init(&work->entry); ret = 1; } @@ -507,8 +684,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, } spin_unlock_irq(&cwq->lock); - if (unlikely(running)) + if (unlikely(running)) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } } static void wait_on_work(struct work_struct *work) @@ -531,7 +710,7 @@ static void wait_on_work(struct work_struct *work) cpu_map = wq_cpu_map(wq); for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + wait_on_cpu_work(get_cwq(cpu, wq), work); } static int __cancel_work_timer(struct work_struct *work, @@ -648,9 +827,7 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), keventd_wq, &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -788,7 +965,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct workqueue_struct *wq = cwq->wq; const char *fmt = is_wq_single_threaded(wq) ? 
"%s" : "%s/%d"; struct task_struct *p; @@ -804,12 +980,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) */ if (IS_ERR(p)) return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; - trace_workqueue_creation(cwq->thread, cpu); - return 0; } @@ -825,9 +997,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) } struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, + unsigned int flags, struct lock_class_key *key, const char *lock_name) { @@ -837,22 +1007,18 @@ struct workqueue_struct *__create_workqueue_key(const char *name, wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + if (!wq->cpu_wq) + goto err; + wq->flags = flags; wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); - if (singlethread) { + if (flags & WQ_SINGLE_THREAD) { cwq = init_cpu_workqueue(wq, singlethread_cpu); err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); @@ -888,6 +1054,12 @@ struct workqueue_struct *__create_workqueue_key(const char *name, wq = NULL; } return wq; +err: + if (wq) { + free_percpu(wq->cpu_wq); + kfree(wq); + } + return NULL; } EXPORT_SYMBOL_GPL(__create_workqueue_key); @@ -914,7 +1086,6 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. */ - trace_workqueue_destruction(cwq->thread); kthread_stop(cwq->thread); cwq->thread = NULL; } @@ -1043,6 +1214,15 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void __init init_workqueues(void) { + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long in case this code survives for longer + * than twenty years. :-P + */ + BUILD_BUG_ON(__alignof__(struct cpu_workqueue_struct) < + __alignof__(unsigned long long)); + alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); cpumask_copy(cpu_populated_map, cpu_online_mask); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a24efcc3e6f0..62a33f3a2b49 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -298,6 +298,14 @@ config DEBUG_OBJECTS_TIMERS timer routines to track the life time of timer objects and validate the timer operations. +config DEBUG_OBJECTS_WORK + bool "Debug work objects" + depends on DEBUG_OBJECTS + help + If you say Y here, additional code will be inserted into the + work queue routines to track the life time of work objects and + validate the work operations. 
+ config DEBUG_OBJECTS_ENABLE_DEFAULT int "debug_objects bootup default value (0-1)" range 0 1 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e1f2bf8d7b1e..9ebb284403f9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -77,7 +77,7 @@ static atomic_t hardware_enable_failed; struct kmem_cache *kvm_vcpu_cache; EXPORT_SYMBOL_GPL(kvm_vcpu_cache); -static __read_mostly struct preempt_ops kvm_preempt_ops; +static __read_mostly struct sched_notifier_ops kvm_sched_notifier_ops; struct dentry *kvm_debugfs_dir; @@ -109,7 +109,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) mutex_lock(&vcpu->mutex); cpu = get_cpu(); - preempt_notifier_register(&vcpu->preempt_notifier); + sched_notifier_register(&vcpu->sched_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } @@ -118,7 +118,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) { preempt_disable(); kvm_arch_vcpu_put(vcpu); - preempt_notifier_unregister(&vcpu->preempt_notifier); + sched_notifier_unregister(&vcpu->sched_notifier); preempt_enable(); mutex_unlock(&vcpu->mutex); } @@ -1192,7 +1192,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (IS_ERR(vcpu)) return PTR_ERR(vcpu); - preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + sched_notifier_init(&vcpu->sched_notifier, &kvm_sched_notifier_ops); r = kvm_arch_vcpu_setup(vcpu); if (r) @@ -2026,23 +2026,21 @@ static struct sys_device kvm_sysdev = { struct page *bad_page; pfn_t bad_pfn; -static inline -struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) +static inline struct kvm_vcpu *sched_notifier_to_vcpu(struct sched_notifier *sn) { - return container_of(pn, struct kvm_vcpu, preempt_notifier); + return container_of(sn, struct kvm_vcpu, sched_notifier); } -static void kvm_sched_in(struct preempt_notifier *pn, int cpu) +static void kvm_sched_in(struct sched_notifier *sn, struct task_struct *prev) { - struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + struct kvm_vcpu *vcpu = sched_notifier_to_vcpu(sn); - kvm_arch_vcpu_load(vcpu, cpu); + kvm_arch_vcpu_load(vcpu, smp_processor_id()); } -static void kvm_sched_out(struct preempt_notifier *pn, - struct task_struct *next) +static void kvm_sched_out(struct sched_notifier *sn, struct task_struct *next) { - struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + struct kvm_vcpu *vcpu = sched_notifier_to_vcpu(sn); kvm_arch_vcpu_put(vcpu); } @@ -2115,8 +2113,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size, goto out_free; } - kvm_preempt_ops.sched_in = kvm_sched_in; - kvm_preempt_ops.sched_out = kvm_sched_out; + kvm_sched_notifier_ops.in = kvm_sched_in; + kvm_sched_notifier_ops.out = kvm_sched_out; kvm_init_debug(); |
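Beyond the rename, the sched_notifier interface grows wakeup and sleep callbacks, and the sleep notifier is fired from schedule() before the task is deactivated, with the runqueue lock still held, precisely so the callback can use the new try_to_wake_up_local() to wake another task bound to the same CPU. A minimal sketch of that pattern; struct my_pool and its standby_worker field are hypothetical, and the caller is assumed to keep standby_worker bound to this CPU:

#include <linux/kernel.h>
#include <linux/sched.h>

struct my_pool {
        struct sched_notifier notifier;
        struct task_struct *standby_worker;     /* bound to this CPU, never current */
};

/* current is about to go to sleep inside schedule() */
static void my_pool_sleep(struct sched_notifier *sn)
{
        struct my_pool *pool = container_of(sn, struct my_pool, notifier);

        /*
         * Called with this_rq() locked.  try_to_wake_up_local() requires
         * exactly that: the rq locked, and a target that is bound to this
         * runqueue and is not the current task.
         */
        if (pool->standby_worker)
                try_to_wake_up_local(pool->standby_worker, TASK_INTERRUPTIBLE, 0);
}

static struct sched_notifier_ops my_pool_notifier_ops = {
        .sleep = my_pool_sleep,
};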
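The new CONFIG_DEBUG_OBJECTS_WORK tracks the lifetime of work items, which is why on-stack items now have to be set up with INIT_WORK_ON_STACK() and torn down with destroy_work_on_stack() before the stack frame goes away (the smpboot.c, flush_work() and wq_barrier hunks above), while statically defined items are recognized through WORK_DATA_STATIC_INIT(). A minimal sketch of the on-stack pattern, with hypothetical names, assuming the caller wants to wait for the result:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/completion.h>

struct my_stack_req {
        struct work_struct work;
        struct completion done;
        int result;
};

static void my_stack_work_fn(struct work_struct *work)
{
        struct my_stack_req *req = container_of(work, struct my_stack_req, work);

        req->result = 0;        /* the actual job goes here */
        complete(&req->done);
}

static int my_run_and_wait(void)
{
        struct my_stack_req req = {
                .done = COMPLETION_INITIALIZER_ONSTACK(req.done),
        };

        /* tell debugobjects that this work_struct lives on the stack */
        INIT_WORK_ON_STACK(&req.work, my_stack_work_fn);

        schedule_work(&req.work);
        wait_for_completion(&req.done);

        /* pairs with INIT_WORK_ON_STACK(); must precede the stack unwind */
        destroy_work_on_stack(&req.work);
        return req.result;
}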
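__create_workqueue() loses its (singlethread, freezeable, rt) argument triple in favour of a WQ_* flags word: create_freezeable_workqueue() now expands to WQ_FREEZEABLE | WQ_SINGLE_THREAD, create_singlethread_workqueue() to WQ_SINGLE_THREAD, and create_rt_workqueue() goes away along with the stop_machine workqueue that used it. The ACPI change drops the bind_workqueue() trick and instead pins each work item to CPU 0 with queue_work_on(), so an ordinary per-cpu workqueue suffices. A sketch of the resulting usage, with hypothetical names:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_smi_work_fn(struct work_struct *work)
{
        /* always executes on CPU 0's worker, see queue_work_on() below */
}

/* statically defined items are initialized with WORK_DATA_STATIC_INIT() */
static DECLARE_WORK(my_smi_work, my_smi_work_fn);

static int my_setup(void)
{
        /* per-cpu workqueue; previously __create_workqueue(name, 0, 0, 0) */
        my_wq = create_workqueue("my_wq");
        if (!my_wq)
                return -ENOMEM;

        /* any CPU may queue the item, but it runs on CPU 0, as in osl.c above */
        queue_work_on(0, my_wq, &my_smi_work);
        return 0;
}

static void my_teardown(void)
{
        flush_workqueue(my_wq);
        destroy_workqueue(my_wq);
}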
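stop_machine no longer rides on an RT workqueue: stop_machine_create() spawns one SCHED_FIFO "kstop/N" kthread per online CPU (kept in step with hotplug by the notifier registered from init_stop_machine()), and __stop_machine() simply wakes those threads and spins until the state machine reaches the new STOPMACHINE_DONE state. The caller-visible API is unchanged; a sketch of typical use, with a hypothetical callback:

#include <linux/errno.h>
#include <linux/stop_machine.h>

/* runs with IRQs off on one CPU while every other online CPU spins in kstop/N */
static int my_atomic_update(void *data)
{
        /* patch code or flip data structures here */
        return 0;
}

static int my_do_update(void *data)
{
        int ret;

        /*
         * Optional: take a reference up front so the per-cpu kstop
         * threads already exist and the stop_machine() call below
         * cannot fail on thread creation.
         */
        ret = stop_machine_create();
        if (ret)
                return ret;

        /* NULL cpumask: my_atomic_update() runs on the first online CPU */
        ret = stop_machine(my_atomic_update, data, NULL);

        stop_machine_destroy();
        return ret;
}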