summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-08-19 12:48:09 +0200
committerIngo Molnar <mingo@elte.hu>2010-08-19 12:48:09 +0200
commitc8710ad38900153af7a3e6762e99c062cfa46443 (patch)
treea0c0632274c4eb72f51e99a5861f71cffe65ea60 /kernel
parent6016ee13db518ab1cd0cbf43fc2ad5712021e338 (diff)
parent86397dc3ccfc0e17b7550d05eaf15fe91f6498dd (diff)
Merge branch 'tip/perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/core
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/async.c141
-rw-r--r--kernel/audit.c3
-rw-r--r--kernel/audit.h26
-rw-r--r--kernel/audit_tree.c237
-rw-r--r--kernel/audit_watch.c274
-rw-r--r--kernel/auditfilter.c39
-rw-r--r--kernel/auditsc.c19
-rw-r--r--kernel/cgroup.c40
-rw-r--r--kernel/compat.c17
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/cpuset.c23
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/debug/debug_core.c4
-rw-r--r--kernel/debug/gdbstub.c191
-rw-r--r--kernel/debug/kdb/kdb_main.c136
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/exec_domain.c22
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/hrtimer.c17
-rw-r--r--kernel/hw_breakpoint.c12
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kfifo.c750
-rw-r--r--kernel/kthread.c164
-rw-r--r--kernel/lockdep.c2
-rw-r--r--kernel/module.c1088
-rw-r--r--kernel/padata.c755
-rw-r--r--kernel/panic.c60
-rw-r--r--kernel/params.c233
-rw-r--r--kernel/perf_event.c2
-rw-r--r--kernel/pid.c56
-rw-r--r--kernel/pm_qos_params.c215
-rw-r--r--kernel/posix-cpu-timers.c44
-rw-r--r--kernel/posix-timers.c11
-rw-r--r--kernel/power/block_io.c2
-rw-r--r--kernel/power/hibernate.c27
-rw-r--r--kernel/power/main.c55
-rw-r--r--kernel/power/process.c21
-rw-r--r--kernel/power/snapshot.c3
-rw-r--r--kernel/power/suspend.c13
-rw-r--r--kernel/power/swap.c12
-rw-r--r--kernel/printk.c43
-rw-r--r--kernel/ptrace.c12
-rw-r--r--kernel/range.c4
-rw-r--r--kernel/rcupdate.c160
-rw-r--r--kernel/rcutiny.c2
-rw-r--r--kernel/rcutorture.c3
-rw-r--r--kernel/rcutree.c2
-rw-r--r--kernel/sched.c391
-rw-r--r--kernel/sched_clock.c95
-rw-r--r--kernel/sched_cpupri.c8
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c532
-rw-r--r--kernel/sched_rt.c3
-rw-r--r--kernel/sched_stats.h27
-rw-r--r--kernel/signal.c9
-rw-r--r--kernel/slow-work-debugfs.c227
-rw-r--r--kernel/slow-work.c1068
-rw-r--r--kernel/slow-work.h72
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c202
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c21
-rw-r--r--kernel/time.c16
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/clocksource.c33
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-sched.c15
-rw-r--r--kernel/time/timekeeping.c93
-rw-r--r--kernel/timer.c53
-rw-r--r--kernel/trace/Kconfig15
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/blktrace.c88
-rw-r--r--kernel/trace/ring_buffer.c3
-rw-r--r--kernel/trace/trace.c62
-rw-r--r--kernel/trace/trace.h19
-rw-r--r--kernel/trace/trace_clock.c2
-rw-r--r--kernel/trace/trace_events.c188
-rw-r--r--kernel/trace/trace_functions_graph.c10
-rw-r--r--kernel/trace/trace_kdb.c136
-rw-r--r--kernel/user_namespace.c44
-rw-r--r--kernel/workqueue.c3177
-rw-r--r--kernel/workqueue_sched.h9
87 files changed, 7390 insertions, 4228 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ce53fb2bd1d9..0b72d1a74be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,10 +70,11 @@ obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
-obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
+obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
+obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
@@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
-obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 385b88461c29..fa7eb3de2ddc 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(file->f_path.dentry, &sbuf))
+ if (vfs_statfs(&file->f_path, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
diff --git a/kernel/async.c b/kernel/async.c
index 15319d6c18fe..cd9dbb913c77 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
-#include <linux/bug.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
-#define MAX_THREADS 256
#define MAX_WORK 32768
static LIST_HEAD(async_pending);
static LIST_HEAD(async_running);
static DEFINE_SPINLOCK(async_lock);
-static int async_enabled = 0;
-
struct async_entry {
- struct list_head list;
- async_cookie_t cookie;
- async_func_ptr *func;
- void *data;
- struct list_head *running;
+ struct list_head list;
+ struct work_struct work;
+ async_cookie_t cookie;
+ async_func_ptr *func;
+ void *data;
+ struct list_head *running;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
-static DECLARE_WAIT_QUEUE_HEAD(async_new);
static atomic_t entry_count;
-static atomic_t thread_count;
extern int initcall_debug;
@@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running)
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
+
/*
* pick the first pending entry and run it
*/
-static void run_one_entry(void)
+static void async_run_entry_fn(struct work_struct *work)
{
+ struct async_entry *entry =
+ container_of(work, struct async_entry, work);
unsigned long flags;
- struct async_entry *entry;
ktime_t calltime, delta, rettime;
- /* 1) pick one task from the pending queue */
-
+ /* 1) move self to the running queue */
spin_lock_irqsave(&async_lock, flags);
- if (list_empty(&async_pending))
- goto out;
- entry = list_first_entry(&async_pending, struct async_entry, list);
-
- /* 2) move it to the running queue */
list_move_tail(&entry->list, entry->running);
spin_unlock_irqrestore(&async_lock, flags);
- /* 3) run it (and print duration)*/
+ /* 2) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
entry->func, task_pid_nr(current));
@@ -153,31 +142,25 @@ static void run_one_entry(void)
(long long)ktime_to_ns(delta) >> 10);
}
- /* 4) remove it from the running queue */
+ /* 3) remove self from the running queue */
spin_lock_irqsave(&async_lock, flags);
list_del(&entry->list);
- /* 5) free the entry */
+ /* 4) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 6) wake up any waiters. */
+ /* 5) wake up any waiters */
wake_up(&async_done);
- return;
-
-out:
- spin_unlock_irqrestore(&async_lock, flags);
}
-
static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
{
struct async_entry *entry;
unsigned long flags;
async_cookie_t newcookie;
-
/* allow irq-off callers */
entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
@@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
* If we're out of memory or if there's too much work
* pending already, we execute synchronously.
*/
- if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
+ if (!entry || atomic_read(&entry_count) > MAX_WORK) {
kfree(entry);
spin_lock_irqsave(&async_lock, flags);
newcookie = next_cookie++;
@@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
ptr(data, newcookie);
return newcookie;
}
+ INIT_WORK(&entry->work, async_run_entry_fn);
entry->func = ptr;
entry->data = data;
entry->running = running;
@@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
list_add_tail(&entry->list, &async_pending);
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- wake_up(&async_new);
+
+ /* schedule for execution */
+ queue_work(system_unbound_wq, &entry->work);
+
return newcookie;
}
@@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie)
async_synchronize_cookie_domain(cookie, &async_running);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-
-
-static int async_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int ret = HZ;
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * check the list head without lock.. false positives
- * are dealt with inside run_one_entry() while holding
- * the lock.
- */
- rmb();
- if (!list_empty(&async_pending))
- run_one_entry();
- else
- ret = schedule_timeout(HZ);
-
- if (ret == 0) {
- /*
- * we timed out, this means we as thread are redundant.
- * we sign off and die, but we to avoid any races there
- * is a last-straw check to see if work snuck in.
- */
- atomic_dec(&thread_count);
- wmb(); /* manager must see our departure first */
- if (list_empty(&async_pending))
- break;
- /*
- * woops work came in between us timing out and us
- * signing off; we need to stay alive and keep working.
- */
- atomic_inc(&thread_count);
- }
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int async_manager_thread(void *unused)
-{
- DECLARE_WAITQUEUE(wq, current);
- add_wait_queue(&async_new, &wq);
-
- while (!kthread_should_stop()) {
- int tc, ec;
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- tc = atomic_read(&thread_count);
- rmb();
- ec = atomic_read(&entry_count);
-
- while (tc < ec && tc < MAX_THREADS) {
- if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
- tc))) {
- msleep(100);
- continue;
- }
- atomic_inc(&thread_count);
- tc++;
- }
-
- schedule();
- }
- remove_wait_queue(&async_new, &wq);
-
- return 0;
-}
-
-static int __init async_init(void)
-{
- async_enabled =
- !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
-
- WARN_ON(!async_enabled);
- return 0;
-}
-
-core_initcall(async_init);
diff --git a/kernel/audit.c b/kernel/audit.c
index c71bd26631a2..d96045789b54 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,7 +56,6 @@
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
-#include <linux/inotify.h>
#include <linux/freezer.h>
#include <linux/tty.h>
@@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
audit_hold_skb(skb);
} else
/* drop the extra reference if sent ok */
- kfree_skb(skb);
+ consume_skb(skb);
}
static int kauditd_thread(void *dummy)
diff --git a/kernel/audit.h b/kernel/audit.h
index 208687be4f30..f7206db4e13d 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex;
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
+extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
+
/* audit watch functions */
-extern unsigned long audit_watch_inode(struct audit_watch *watch);
-extern dev_t audit_watch_dev(struct audit_watch *watch);
+#ifdef CONFIG_AUDIT_WATCH
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
-extern int audit_add_watch(struct audit_krule *krule);
-extern void audit_remove_watch(struct audit_watch *watch);
-extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list);
-extern void audit_inotify_unregister(struct list_head *in_list);
+extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
+extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern struct list_head *audit_watch_rules(struct audit_watch *watch);
-
-extern struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+#else
+#define audit_put_watch(w) {}
+#define audit_get_watch(w) {}
+#define audit_to_watch(k, p, l, o) (-EINVAL)
+#define audit_add_watch(k, l) (-EINVAL)
+#define audit_remove_watch_rule(k) BUG()
+#define audit_watch_path(w) ""
+#define audit_watch_compare(w, i, d) 0
+
+#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 46a57b57a335..7f18d3a4527e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1,5 +1,5 @@
#include "audit.h"
-#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/kthread.h>
@@ -22,7 +22,7 @@ struct audit_tree {
struct audit_chunk {
struct list_head hash;
- struct inotify_watch watch;
+ struct fsnotify_mark mark;
struct list_head trees; /* with root here */
int dead;
int count;
@@ -59,7 +59,7 @@ static LIST_HEAD(prune_list);
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount
* of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
@@ -68,7 +68,7 @@ static LIST_HEAD(prune_list);
* that makes a difference. Some.
*/
-static struct inotify_handle *rtree_ih;
+static struct fsnotify_group *audit_tree_group;
static struct audit_tree *alloc_tree(const char *s)
{
@@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree)
return tree->pathname;
}
-static struct audit_chunk *alloc_chunk(int count)
-{
- struct audit_chunk *chunk;
- size_t size;
- int i;
-
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
- if (!chunk)
- return NULL;
-
- INIT_LIST_HEAD(&chunk->hash);
- INIT_LIST_HEAD(&chunk->trees);
- chunk->count = count;
- atomic_long_set(&chunk->refs, 1);
- for (i = 0; i < count; i++) {
- INIT_LIST_HEAD(&chunk->owners[i].list);
- chunk->owners[i].index = i;
- }
- inotify_init_watch(&chunk->watch);
- return chunk;
-}
-
static void free_chunk(struct audit_chunk *chunk)
{
int i;
@@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu)
audit_put_chunk(chunk);
}
+static void audit_tree_destroy_watch(struct fsnotify_mark *entry)
+{
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+ call_rcu(&chunk->head, __put_chunk);
+}
+
+static struct audit_chunk *alloc_chunk(int count)
+{
+ struct audit_chunk *chunk;
+ size_t size;
+ int i;
+
+ size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
+ chunk = kzalloc(size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+ }
+ fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ return chunk;
+}
+
enum {HASH_SIZE = 128};
static struct list_head chunk_hash_heads[HASH_SIZE];
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock);
@@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode)
return chunk_hash_heads + n % HASH_SIZE;
}
-/* hash_lock is held by caller */
+/* hash_lock & entry->lock is held by caller */
static void insert_hash(struct audit_chunk *chunk)
{
- struct list_head *list = chunk_hash(chunk->watch.inode);
+ struct fsnotify_mark *entry = &chunk->mark;
+ struct list_head *list;
+
+ if (!entry->i.inode)
+ return;
+ list = chunk_hash(entry->i.inode);
list_add_rcu(&chunk->hash, list);
}
@@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
struct audit_chunk *p;
list_for_each_entry_rcu(p, list, hash) {
- if (p->watch.inode == inode) {
+ /* mark.inode may have gone NULL, but who cares? */
+ if (p->mark.i.inode == inode) {
atomic_long_inc(&p->refs);
return p;
}
@@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p)
static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
+ struct fsnotify_mark *entry = &chunk->mark;
struct audit_chunk *new;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
- if (!pin_inotify_watch(&chunk->watch)) {
- /*
- * Filesystem is shutting down; all watches are getting
- * evicted, just take it off the node list for this
- * tree and let the eviction logics take care of the
- * rest.
- */
- owner = p->owner;
- if (owner->root == chunk) {
- list_del_init(&owner->same_root);
- owner->root = NULL;
- }
- list_del_init(&p->list);
- p->owner = NULL;
- put_tree(owner);
- return;
- }
+ fsnotify_get_mark(entry);
spin_unlock(&hash_lock);
- /*
- * pin_inotify_watch() succeeded, so the watch won't go away
- * from under us.
- */
- mutex_lock(&chunk->watch.inode->inotify_mutex);
- if (chunk->dead) {
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ spin_lock(&entry->lock);
+ if (chunk->dead || !entry->i.inode) {
+ spin_unlock(&entry->lock);
goto out;
}
@@ -256,16 +249,17 @@ static void untag_chunk(struct node *p)
list_del_init(&p->list);
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry);
+ fsnotify_put_mark(entry);
goto out;
}
new = alloc_chunk(size);
if (!new)
goto Fallback;
- if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) {
+ fsnotify_duplicate_mark(&new->mark, entry);
+ if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
free_chunk(new);
goto Fallback;
}
@@ -298,9 +292,9 @@ static void untag_chunk(struct node *p)
list_for_each_entry(owner, &new->trees, same_root)
owner->root = new;
spin_unlock(&hash_lock);
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry);
+ fsnotify_put_mark(entry);
goto out;
Fallback:
@@ -314,31 +308,33 @@ Fallback:
p->owner = NULL;
put_tree(owner);
spin_unlock(&hash_lock);
- mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ spin_unlock(&entry->lock);
out:
- unpin_inotify_watch(&chunk->watch);
+ fsnotify_put_mark(entry);
spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
{
+ struct fsnotify_mark *entry;
struct audit_chunk *chunk = alloc_chunk(1);
if (!chunk)
return -ENOMEM;
- if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) {
+ entry = &chunk->mark;
+ if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
free_chunk(chunk);
return -ENOSPC;
}
- mutex_lock(&inode->inotify_mutex);
+ spin_lock(&entry->lock);
spin_lock(&hash_lock);
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&entry->lock);
+ fsnotify_destroy_mark(entry);
+ fsnotify_put_mark(entry);
return 0;
}
chunk->owners[0].index = (1U << 31);
@@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
}
insert_hash(chunk);
spin_unlock(&hash_lock);
- mutex_unlock(&inode->inotify_mutex);
+ spin_unlock(&entry->lock);
return 0;
}
/* the first tagged inode becomes root of tree */
static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
- struct inotify_watch *watch;
+ struct fsnotify_mark *old_entry, *chunk_entry;
struct audit_tree *owner;
struct audit_chunk *chunk, *old;
struct node *p;
int n;
- if (inotify_find_watch(rtree_ih, inode, &watch) < 0)
+ old_entry = fsnotify_find_inode_mark(audit_tree_group, inode);
+ if (!old_entry)
return create_chunk(inode, tree);
- old = container_of(watch, struct audit_chunk, watch);
+ old = container_of(old_entry, struct audit_chunk, mark);
/* are we already there? */
spin_lock(&hash_lock);
for (n = 0; n < old->count; n++) {
if (old->owners[n].owner == tree) {
spin_unlock(&hash_lock);
- put_inotify_watch(&old->watch);
+ fsnotify_put_mark(old_entry);
return 0;
}
}
@@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk = alloc_chunk(old->count + 1);
if (!chunk) {
- put_inotify_watch(&old->watch);
+ fsnotify_put_mark(old_entry);
return -ENOMEM;
}
- mutex_lock(&inode->inotify_mutex);
- if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
+ chunk_entry = &chunk->mark;
+
+ spin_lock(&old_entry->lock);
+ if (!old_entry->i.inode) {
+ /* old_entry is being shot, lets just lie */
+ spin_unlock(&old_entry->lock);
+ fsnotify_put_mark(old_entry);
free_chunk(chunk);
+ return -ENOENT;
+ }
+
+ fsnotify_duplicate_mark(chunk_entry, old_entry);
+ if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
+ spin_unlock(&old_entry->lock);
+ free_chunk(chunk);
+ fsnotify_put_mark(old_entry);
return -ENOSPC;
}
+
+ /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */
+ spin_lock(&chunk_entry->lock);
spin_lock(&hash_lock);
+
+ /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */
if (tree->goner) {
spin_unlock(&hash_lock);
chunk->dead = 1;
- inotify_evict_watch(&chunk->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch);
- put_inotify_watch(&chunk->watch);
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+
+ fsnotify_destroy_mark(chunk_entry);
+
+ fsnotify_put_mark(chunk_entry);
+ fsnotify_put_mark(old_entry);
return 0;
}
list_replace_init(&old->trees, &chunk->trees);
@@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
list_add(&tree->same_root, &chunk->trees);
}
spin_unlock(&hash_lock);
- inotify_evict_watch(&old->watch);
- mutex_unlock(&inode->inotify_mutex);
- put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
- put_inotify_watch(&old->watch); /* and kill it */
+ spin_unlock(&chunk_entry->lock);
+ spin_unlock(&old_entry->lock);
+ fsnotify_destroy_mark(old_entry);
+ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
+ fsnotify_put_mark(old_entry); /* and kill it */
return 0;
}
@@ -584,7 +601,9 @@ void audit_trim_trees(void)
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
- struct inode *inode = find_chunk(node)->watch.inode;
+ struct audit_chunk *chunk = find_chunk(node);
+ /* this could be NULL if the watch is dieing else where... */
+ struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
node->index &= ~(1U<<31);
@@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list)
* Here comes the stuff asynchronous to auditctl operations
*/
-/* inode->inotify_mutex is locked */
static void evict_chunk(struct audit_chunk *chunk)
{
struct audit_tree *owner;
@@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk)
mutex_unlock(&audit_filter_mutex);
}
-static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+static int audit_tree_handle_event(struct fsnotify_group *group,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmonut_mark,
+ struct fsnotify_event *event)
+{
+ BUG();
+ return -EOPNOTSUPP;
+}
+
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+ struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
- if (mask & IN_IGNORED) {
- evict_chunk(chunk);
- put_inotify_watch(watch);
- }
+ evict_chunk(chunk);
+ fsnotify_put_mark(entry);
}
-static void destroy_watch(struct inotify_watch *watch)
+static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ __u32 mask, void *data, int data_type)
{
- struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
- call_rcu(&chunk->head, __put_chunk);
+ return false;
}
-static const struct inotify_operations rtree_inotify_ops = {
- .handle_event = handle_event,
- .destroy_watch = destroy_watch,
+static const struct fsnotify_ops audit_tree_ops = {
+ .handle_event = audit_tree_handle_event,
+ .should_send_event = audit_tree_send_event,
+ .free_group_priv = NULL,
+ .free_event_priv = NULL,
+ .freeing_mark = audit_tree_freeing_mark,
};
static int __init audit_tree_init(void)
{
int i;
- rtree_ih = inotify_init(&rtree_inotify_ops);
- if (IS_ERR(rtree_ih))
- audit_panic("cannot initialize inotify handle for rectree watches");
+ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+ if (IS_ERR(audit_tree_group))
+ audit_panic("cannot initialize fsnotify group for rectree watches");
for (i = 0; i < HASH_SIZE; i++)
INIT_LIST_HEAD(&chunk_hash_heads[i]);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 8df43696f4ba..f0c9b2e7542d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -24,18 +24,18 @@
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/inotify.h>
#include <linux/security.h>
#include "audit.h"
/*
* Reference counting:
*
- * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED
* event. Each audit_watch holds a reference to its associated parent.
*
* audit_watch: if added to lists, lifetime is from audit_init_watch() to
@@ -51,40 +51,61 @@ struct audit_watch {
unsigned long ino; /* associated inode number */
struct audit_parent *parent; /* associated parent */
struct list_head wlist; /* entry in parent->watches list */
- struct list_head rules; /* associated rules */
+ struct list_head rules; /* anchor for krule->rlist */
};
struct audit_parent {
- struct list_head ilist; /* entry in inotify registration list */
- struct list_head watches; /* associated watches */
- struct inotify_watch wdata; /* inotify watch data */
- unsigned flags; /* status flags */
+ struct list_head watches; /* anchor for audit_watch->wlist */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
};
-/* Inotify handle. */
-struct inotify_handle *audit_ih;
+/* fsnotify handle. */
+struct fsnotify_group *audit_watch_group;
-/*
- * audit_parent status flags:
- *
- * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
- * a filesystem event to ensure we're adding audit watches to a valid parent.
- * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
- * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
- * we can receive while holding nameidata.
- */
-#define AUDIT_PARENT_INVALID 0x001
+/* fsnotify events we care about. */
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
-/* Inotify events we care about. */
-#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+static void audit_free_parent(struct audit_parent *parent)
+{
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
-static void audit_free_parent(struct inotify_watch *i_watch)
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
{
struct audit_parent *parent;
- parent = container_of(i_watch, struct audit_parent, wdata);
- WARN_ON(!list_empty(&parent->watches));
- kfree(parent);
+ parent = container_of(entry, struct audit_parent, mark);
+ audit_free_parent(parent);
+}
+
+static void audit_get_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_get_mark(&parent->mark);
+}
+
+static void audit_put_parent(struct audit_parent *parent)
+{
+ if (likely(parent))
+ fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode. If found a reference
+ * is taken on this parent.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+ struct audit_parent *parent = NULL;
+ struct fsnotify_mark *entry;
+
+ entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+ if (entry)
+ parent = container_of(entry, struct audit_parent, mark);
+
+ return parent;
}
void audit_get_watch(struct audit_watch *watch)
@@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch)
void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
- put_inotify_watch(&watch->parent->wdata);
+ audit_put_parent(watch->parent);
watch->parent = NULL;
audit_put_watch(watch); /* match initial get */
}
@@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch)
return watch->path;
}
-struct list_head *audit_watch_rules(struct audit_watch *watch)
-{
- return &watch->rules;
-}
-
-unsigned long audit_watch_inode(struct audit_watch *watch)
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return watch->ino;
-}
-
-dev_t audit_watch_dev(struct audit_watch *watch)
-{
- return watch->dev;
+ return (watch->ino != (unsigned long)-1) &&
+ (watch->ino == ino) &&
+ (watch->dev == dev);
}
/* Initialize a parent watch entry. */
static struct audit_parent *audit_init_parent(struct nameidata *ndp)
{
+ struct inode *inode = ndp->path.dentry->d_inode;
struct audit_parent *parent;
- s32 wd;
+ int ret;
parent = kzalloc(sizeof(*parent), GFP_KERNEL);
if (unlikely(!parent))
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&parent->watches);
- parent->flags = 0;
-
- inotify_init_watch(&parent->wdata);
- /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
- get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata,
- ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
- if (wd < 0) {
- audit_free_parent(&parent->wdata);
- return ERR_PTR(wd);
+
+ fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+ parent->mark.mask = AUDIT_FS_WATCH;
+ ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+ if (ret < 0) {
+ audit_free_parent(parent);
+ return ERR_PTR(ret);
}
return parent;
@@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
- if (!audit_ih)
+ if (!audit_watch_group)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
@@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
new->dev = old->dev;
new->ino = old->ino;
- get_inotify_watch(&old->parent->wdata);
+ audit_get_parent(old->parent);
new->parent = old->parent;
out:
@@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent,
struct audit_entry *oentry, *nentry;
mutex_lock(&audit_filter_mutex);
+ /* Run all of the watches on this parent looking for the one that
+ * matches the given dname */
list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
if (audit_compare_dname_path(dname, owatch->path, NULL))
continue;
/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
- if (invalidating && current->audit_context)
+ if (invalidating && !audit_dummy_context())
audit_filter_inodes(current, current->audit_context);
+ /* updating ino will likely change which audit_hash_list we
+ * are on so we need a new watch for the new list */
nwatch = audit_dupe_watch(owatch);
if (IS_ERR(nwatch)) {
mutex_unlock(&audit_filter_mutex);
@@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent,
list_del(&oentry->rule.rlist);
list_del_rcu(&oentry->list);
- nentry = audit_dupe_rule(&oentry->rule, nwatch);
+ nentry = audit_dupe_rule(&oentry->rule);
if (IS_ERR(nentry)) {
list_del(&oentry->rule.list);
audit_panic("error updating watch, removing");
} else {
int h = audit_hash_ino((u32)ino);
+
+ /*
+ * nentry->rule.watch == oentry->rule.watch so
+ * we must drop that reference and set it to our
+ * new watch.
+ */
+ audit_put_watch(nentry->rule.watch);
+ audit_get_watch(nwatch);
+ nentry->rule.watch = nwatch;
list_add(&nentry->rule.rlist, &nwatch->rules);
list_add_rcu(&nentry->list, &audit_inode_hash[h]);
list_replace(&oentry->rule.list,
@@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
struct audit_entry *e;
mutex_lock(&audit_filter_mutex);
- parent->flags |= AUDIT_PARENT_INVALID;
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
@@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
audit_remove_watch(w);
}
mutex_unlock(&audit_filter_mutex);
-}
-
-/* Unregister inotify watches for parents on in_list.
- * Generates an IN_IGNORED event. */
-void audit_inotify_unregister(struct list_head *in_list)
-{
- struct audit_parent *p, *n;
- list_for_each_entry_safe(p, n, in_list, ilist) {
- list_del(&p->ilist);
- inotify_rm_watch(audit_ih, &p->wdata);
- /* the unpin matching the pin in audit_do_del_rule() */
- unpin_inotify_watch(&p->wdata);
- }
+ fsnotify_destroy_mark(&parent->mark);
}
/* Get path information necessary for adding watches. */
@@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
}
}
-/* Associate the given rule with an existing parent inotify_watch.
+/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
struct audit_parent *parent)
@@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule,
struct audit_watch *w, *watch = krule->watch;
int watch_found = 0;
+ BUG_ON(!mutex_is_locked(&audit_filter_mutex));
+
list_for_each_entry(w, &parent->watches, wlist) {
if (strcmp(watch->path, w->path))
continue;
@@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule,
}
if (!watch_found) {
- get_inotify_watch(&parent->wdata);
+ audit_get_parent(parent);
watch->parent = parent;
list_add(&watch->wlist, &parent->watches);
@@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule,
/* Find a matching watch entry, or add this one.
* Caller must hold audit_filter_mutex. */
-int audit_add_watch(struct audit_krule *krule)
+int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
- struct inotify_watch *i_watch;
struct audit_parent *parent;
struct nameidata *ndp = NULL, *ndw = NULL;
- int ret = 0;
+ int h, ret = 0;
mutex_unlock(&audit_filter_mutex);
@@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule)
goto error;
}
+ mutex_lock(&audit_filter_mutex);
+
/* update watch filter fields */
if (ndw) {
watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
watch->ino = ndw->path.dentry->d_inode->i_ino;
}
- /* The audit_filter_mutex must not be held during inotify calls because
- * we hold it during inotify event callback processing. If an existing
- * inotify watch is found, inotify_find_watch() grabs a reference before
- * returning.
- */
- if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
- &i_watch) < 0) {
+ /* either find an old parent or attach a new one */
+ parent = audit_find_parent(ndp->path.dentry->d_inode);
+ if (!parent) {
parent = audit_init_parent(ndp);
if (IS_ERR(parent)) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
ret = PTR_ERR(parent);
goto error;
}
- } else
- parent = container_of(i_watch, struct audit_parent, wdata);
-
- mutex_lock(&audit_filter_mutex);
+ }
- /* parent was moved before we took audit_filter_mutex */
- if (parent->flags & AUDIT_PARENT_INVALID)
- ret = -ENOENT;
- else
- audit_add_to_parent(krule, parent);
+ audit_add_to_parent(krule, parent);
- /* match get in audit_init_parent or inotify_find_watch */
- put_inotify_watch(&parent->wdata);
+ /* match get in audit_find_parent or audit_init_parent */
+ audit_put_parent(parent);
+ h = audit_hash_ino((u32)watch->ino);
+ *list = &audit_inode_hash[h];
error:
audit_put_nd(ndp, ndw); /* NULL args OK */
return ret;
}
-void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
+void audit_remove_watch_rule(struct audit_krule *krule)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent = watch->parent;
@@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list)
audit_remove_watch(watch);
if (list_empty(&parent->watches)) {
- /* Put parent on the inotify un-registration
- * list. Grab a reference before releasing
- * audit_filter_mutex, to be released in
- * audit_inotify_unregister().
- * If filesystem is going away, just leave
- * the sucker alone, eviction will take
- * care of it. */
- if (pin_inotify_watch(&parent->wdata))
- list_add(&parent->ilist, list);
+ audit_get_parent(parent);
+ fsnotify_destroy_mark(&parent->mark);
+ audit_put_parent(parent);
}
}
}
-/* Update watch data in audit rules based on inotify events. */
-static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
- u32 cookie, const char *dname, struct inode *inode)
+static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ __u32 mask, void *data, int data_type)
+{
+ return true;
+}
+
+/* Update watch data in audit rules based on fsnotify events. */
+static int audit_watch_handle_event(struct fsnotify_group *group,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ struct fsnotify_event *event)
{
+ struct inode *inode;
+ __u32 mask = event->mask;
+ const char *dname = event->file_name;
struct audit_parent *parent;
- parent = container_of(i_watch, struct audit_parent, wdata);
+ parent = container_of(inode_mark, struct audit_parent, mark);
- if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
- audit_update_watch(parent, dname, inode->i_sb->s_dev,
- inode->i_ino, 0);
- else if (mask & (IN_DELETE|IN_MOVED_FROM))
+ BUG_ON(group != audit_watch_group);
+
+ switch (event->data_type) {
+ case (FSNOTIFY_EVENT_PATH):
+ inode = event->path.dentry->d_inode;
+ break;
+ case (FSNOTIFY_EVENT_INODE):
+ inode = event->inode;
+ break;
+ default:
+ BUG();
+ inode = NULL;
+ break;
+ };
+
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+ audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
+ else if (mask & (FS_DELETE|FS_MOVED_FROM))
audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
- /* inotify automatically removes the watch and sends IN_IGNORED */
- else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
- audit_remove_parent_watches(parent);
- /* inotify does not remove the watch, so remove it manually */
- else if(mask & IN_MOVE_SELF) {
+ else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
- inotify_remove_watch_locked(audit_ih, i_watch);
- } else if (mask & IN_IGNORED)
- put_inotify_watch(i_watch);
+
+ return 0;
}
-static const struct inotify_operations audit_inotify_ops = {
- .handle_event = audit_handle_ievent,
- .destroy_watch = audit_free_parent,
+static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+ .should_send_event = audit_watch_should_send_event,
+ .handle_event = audit_watch_handle_event,
+ .free_group_priv = NULL,
+ .freeing_mark = NULL,
+ .free_event_priv = NULL,
};
static int __init audit_watch_init(void)
{
- audit_ih = inotify_init(&audit_inotify_ops);
- if (IS_ERR(audit_ih))
- audit_panic("cannot initialize inotify handle");
+ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
+ if (IS_ERR(audit_watch_group)) {
+ audit_watch_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
return 0;
}
-subsys_initcall(audit_watch_init);
+device_initcall(audit_watch_init);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index ce08041f578d..eb7675499fb5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e)
{
int i;
struct audit_krule *erule = &e->rule;
+
/* some rules don't have associated watches */
if (erule->watch)
audit_put_watch(erule->watch);
@@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df,
* rule with the new rule in the filterlist, then free the old rule.
* The rlist element is undefined; list manipulations are handled apart from
* the initial copy. */
-struct audit_entry *audit_dupe_rule(struct audit_krule *old,
- struct audit_watch *watch)
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
new->prio = old->prio;
new->buflen = old->buflen;
new->inode_f = old->inode_f;
- new->watch = NULL;
new->field_count = old->field_count;
+
/*
* note that we are OK with not refcounting here; audit_match_tree()
* never dereferences tree and we can't get false positives there
@@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old,
}
}
- if (watch) {
- audit_get_watch(watch);
- new->watch = watch;
+ if (old->watch) {
+ audit_get_watch(old->watch);
+ new->watch = old->watch;
}
return entry;
@@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int h, err;
+ int err;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry)
if (watch) {
/* audit_filter_mutex is dropped and re-taken during this call */
- err = audit_add_watch(&entry->rule);
+ err = audit_add_watch(&entry->rule, &list);
if (err) {
mutex_unlock(&audit_filter_mutex);
goto error;
}
- /* entry->rule.watch may have changed during audit_add_watch() */
- watch = entry->rule.watch;
- h = audit_hash_ino((u32)audit_watch_inode(watch));
- list = &audit_inode_hash[h];
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
@@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- LIST_HEAD(inotify_list);
int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry)
}
if (e->rule.watch)
- audit_remove_watch_rule(&e->rule, &inotify_list);
+ audit_remove_watch_rule(&e->rule);
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
@@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- if (!list_empty(&inotify_list))
- audit_inotify_unregister(&inotify_list);
-
out:
if (watch)
audit_put_watch(watch); /* match initial get */
@@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r)
{
struct audit_entry *entry = container_of(r, struct audit_entry, rule);
struct audit_entry *nentry;
- struct audit_watch *watch;
- struct audit_tree *tree;
int err = 0;
if (!security_audit_rule_known(r))
return 0;
- watch = r->watch;
- tree = r->tree;
- nentry = audit_dupe_rule(r, watch);
+ nentry = audit_dupe_rule(r);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
err = PTR_ERR(nentry);
audit_panic("error updating LSM filters");
- if (watch)
+ if (r->watch)
list_del(&r->rlist);
list_del_rcu(&entry->list);
list_del(&r->list);
} else {
- if (watch) {
- list_add(&nentry->rule.rlist, audit_watch_rules(watch));
- list_del(&r->rlist);
- } else if (tree)
+ if (r->watch || r->tree)
list_replace_init(&r->rlist, &nentry->rule.rlist);
list_replace_rcu(&entry->list, &nentry->list);
list_replace(&r->list, &nentry->rule.list);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3828ad5fb8f1..1b31c130d034 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,7 +65,6 @@
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/inotify.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
@@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name && audit_watch_inode(rule->watch) != (unsigned long)-1)
- result = (name->dev == audit_watch_dev(rule->watch) &&
- name->ino == audit_watch_inode(rule->watch));
+ if (name)
+ result = audit_watch_compare(rule->watch, name->ino, name->dev);
break;
case AUDIT_DIR:
if (ctx)
@@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
- if (likely(list_empty(&inode->inotify_watches)))
+ if (likely(hlist_empty(&inode->i_fsnotify_marks)))
return;
context = current->audit_context;
p = context->trees;
@@ -1769,7 +1767,7 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d->d_inode;
- if (inode && unlikely(!list_empty(&inode->inotify_watches))) {
+ if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
struct audit_chunk *chunk;
chunk = audit_tree_lookup(inode);
if (chunk) {
@@ -1837,13 +1835,8 @@ void __audit_getname(const char *name)
context->names[context->name_count].ino = (unsigned long)-1;
context->names[context->name_count].osid = 0;
++context->name_count;
- if (!context->pwd.dentry) {
- read_lock(&current->fs->lock);
- context->pwd = current->fs->pwd;
- path_get(&current->fs->pwd);
- read_unlock(&current->fs->lock);
- }
-
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
}
/* audit_putname - intercept a putname request
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b0a64b..192f88c5b0f9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (opts->release_agent)
return -EINVAL;
opts->release_agent =
- kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
} else if (!strncmp(token, "name=", 5)) {
@@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (opts->name)
return -EINVAL;
opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
@@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = {
.kill_sb = cgroup_kill_sb,
};
+static struct kobject *cgroup_kobj;
+
static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
return dentry->d_fsdata;
@@ -1788,6 +1790,29 @@ out:
return retval;
}
+/**
+ * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_current_cg(struct task_struct *tsk)
+{
+ struct cgroupfs_root *root;
+ struct cgroup *cur_cg;
+ int retval = 0;
+
+ cgroup_lock();
+ for_each_active_root(root) {
+ cur_cg = task_cgroup_from_root(current, root);
+ retval = cgroup_attach_task(cur_cg, tsk);
+ if (retval)
+ break;
+ }
+ cgroup_unlock();
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
+
/*
* Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
* held. May take task_lock of task
@@ -3871,9 +3896,18 @@ int __init cgroup_init(void)
hhead = css_set_hash(init_css_set.subsys);
hlist_add_head(&init_css_set.hlist, hhead);
BUG_ON(!init_root_id(&rootnode));
+
+ cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
+ if (!cgroup_kobj) {
+ err = -ENOMEM;
+ goto out;
+ }
+
err = register_filesystem(&cgroup_fs_type);
- if (err < 0)
+ if (err < 0) {
+ kobject_put(cgroup_kobj);
goto out;
+ }
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..e167efce8423 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs ();
-
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
__get_user(r.rlim_cur, &rlim->rlim_cur) ||
@@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource,
r.rlim_cur = RLIM_INFINITY;
if (r.rlim_max == COMPAT_RLIM_INFINITY)
r.rlim_max = RLIM_INFINITY;
- set_fs(KERNEL_DS);
- ret = sys_setrlimit(resource, (struct rlimit __user *) &r);
- set_fs(old_fs);
- return ret;
+ return do_prlimit(current, resource, &r, NULL);
}
#ifdef COMPAT_RLIM_OLD_INFINITY
@@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource,
#endif
-asmlinkage long compat_sys_getrlimit (unsigned int resource,
+asmlinkage long compat_sys_getrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
struct rlimit r;
int ret;
- mm_segment_t old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_getrlimit(resource, (struct rlimit __user *) &r);
- set_fs(old_fs);
+ ret = do_prlimit(current, resource, NULL, &r);
if (!ret) {
if (r.rlim_cur > COMPAT_RLIM_INFINITY)
r.rlim_cur = COMPAT_RLIM_INFINITY;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
- set_cpu_active(cpu, false);
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
- set_cpu_active(cpu, true);
-
nr_calls--;
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
- set_cpu_active(cpu, true);
/* CPU didn't die: tell everyone. Can't complain. */
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
- set_cpu_active(cpu, true);
-
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset heirarchy */
+ /* used for walking a cpuset hierarchy */
struct list_head stack_list;
};
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
* but making no active use of cpusets.
*
* This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
*/
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
- unsigned long phase, void *unused_cpu)
+void cpuset_update_active_cpus(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
- switch (phase) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
cgroup_lock();
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
- return NOTIFY_OK;
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- hotcpu_notifier(cpuset_track_online_cpus, 0);
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..9a3e22641fe7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -22,10 +22,6 @@
#define kdebug(FMT, ...) \
printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#else
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
#define kdebug(FMT, ...) \
no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
#endif
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 8bc5eeffec8a..3c2d4972d235 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -6,7 +6,7 @@
* Copyright (C) 2000-2001 VERITAS Software Corporation.
* Copyright (C) 2002-2004 Timesys Corporation
* Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
* Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
* Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -605,6 +605,8 @@ cpu_master_loop:
if (dbg_kdb_mode) {
kgdb_connected = 1;
error = kdb_stub(ks);
+ if (error == -1)
+ continue;
kgdb_connected = 0;
} else {
error = gdb_serial_stub(ks);
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index e8fd6868682d..481a7bd2dfe7 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -6,7 +6,7 @@
* Copyright (C) 2000-2001 VERITAS Software Corporation.
* Copyright (C) 2002-2004 Timesys Corporation
* Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
- * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
* Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
* Copyright (C) 2005-2009 Wind River Systems, Inc.
@@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES +
* GDB remote protocol parser:
*/
-static int hex(char ch)
-{
- if ((ch >= 'a') && (ch <= 'f'))
- return ch - 'a' + 10;
- if ((ch >= '0') && (ch <= '9'))
- return ch - '0';
- if ((ch >= 'A') && (ch <= 'F'))
- return ch - 'A' + 10;
- return -1;
-}
-
#ifdef CONFIG_KGDB_KDB
static int gdbstub_read_wait(void)
{
@@ -123,8 +112,8 @@ static void get_packet(char *buffer)
buffer[count] = 0;
if (ch == '#') {
- xmitcsum = hex(gdbstub_read_wait()) << 4;
- xmitcsum += hex(gdbstub_read_wait());
+ xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4;
+ xmitcsum += hex_to_bin(gdbstub_read_wait());
if (checksum != xmitcsum)
/* failed checksum */
@@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len)
* buf. Return a pointer to the last char put in buf (null). May
* return an error.
*/
-int kgdb_mem2hex(char *mem, char *buf, int count)
+char *kgdb_mem2hex(char *mem, char *buf, int count)
{
char *tmp;
int err;
@@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
tmp = buf + count;
err = probe_kernel_read(tmp, mem, count);
- if (!err) {
- while (count > 0) {
- buf = pack_hex_byte(buf, *tmp);
- tmp++;
- count--;
- }
-
- *buf = 0;
+ if (err)
+ return NULL;
+ while (count > 0) {
+ buf = pack_hex_byte(buf, *tmp);
+ tmp++;
+ count--;
}
+ *buf = 0;
- return err;
+ return buf;
}
/*
@@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
tmp_hex = tmp_raw - 1;
while (tmp_hex >= buf) {
tmp_raw--;
- *tmp_raw = hex(*tmp_hex--);
- *tmp_raw |= hex(*tmp_hex--) << 4;
+ *tmp_raw = hex_to_bin(*tmp_hex--);
+ *tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
}
return probe_kernel_write(mem, tmp_raw, count);
@@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
(*ptr)++;
}
while (**ptr) {
- hex_val = hex(**ptr);
+ hex_val = hex_to_bin(**ptr);
if (hex_val < 0)
break;
@@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
return probe_kernel_write(mem, c, size);
}
+#if DBG_MAX_REG_NUM > 0
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_get_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ int i;
+ int idx = 0;
+ char *ptr = (char *)gdb_regs;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ dbg_set_reg(i, ptr + idx, regs);
+ idx += dbg_reg_def[i].size;
+ }
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
/* Write memory due to an 'M' or 'X' packet. */
static int write_mem_msg(int binary)
{
@@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error)
* remapped to negative TIDs.
*/
-#define BUF_THREAD_ID_SIZE 16
+#define BUF_THREAD_ID_SIZE 8
static char *pack_threadid(char *pkt, unsigned char *id)
{
- char *limit;
+ unsigned char *limit;
+ int lzero = 1;
+
+ limit = id + (BUF_THREAD_ID_SIZE / 2);
+ while (id < limit) {
+ if (!lzero || *id != 0) {
+ pkt = pack_hex_byte(pkt, *id);
+ lzero = 0;
+ }
+ id++;
+ }
- limit = pkt + BUF_THREAD_ID_SIZE;
- while (pkt < limit)
- pkt = pack_hex_byte(pkt, *id++);
+ if (lzero)
+ pkt = pack_hex_byte(pkt, 0);
return pkt;
}
static void int_to_threadref(unsigned char *id, int value)
{
- unsigned char *scan;
- int i = 4;
-
- scan = (unsigned char *)id;
- while (i--)
- *scan++ = 0;
- put_unaligned_be32(value, scan);
+ put_unaligned_be32(value, id);
}
static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
pack_hex_byte(&remcom_out_buffer[1], ks->signo);
}
-/* Handle the 'g' get registers request */
-static void gdb_cmd_getregs(struct kgdb_state *ks)
+static void gdb_get_regs_helper(struct kgdb_state *ks)
{
struct task_struct *thread;
void *local_debuggerinfo;
@@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
*/
sleeping_thread_to_gdb_regs(gdb_regs, thread);
}
+}
+
+/* Handle the 'g' get registers request */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+ gdb_get_regs_helper(ks);
kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
}
@@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks)
char *ptr = &remcom_in_buffer[1];
unsigned long length;
unsigned long addr;
- int err;
+ char *err;
if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
kgdb_hex2long(&ptr, &length) > 0) {
err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
- if (err)
- error_packet(remcom_out_buffer, err);
+ if (!err)
+ error_packet(remcom_out_buffer, -EINVAL);
} else {
error_packet(remcom_out_buffer, -EINVAL);
}
@@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks)
strcpy(remcom_out_buffer, "OK");
}
+#if DBG_MAX_REG_NUM > 0
+static char *gdb_hex_reg_helper(int regnum, char *out)
+{
+ int i;
+ int offset = 0;
+
+ for (i = 0; i < regnum; i++)
+ offset += dbg_reg_def[i].size;
+ return kgdb_mem2hex((char *)gdb_regs + offset, out,
+ dbg_reg_def[i].size);
+}
+
+/* Handle the 'p' individual regster get */
+static void gdb_cmd_reg_get(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (regnum >= DBG_MAX_REG_NUM) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ gdb_get_regs_helper(ks);
+ gdb_hex_reg_helper(regnum, remcom_out_buffer);
+}
+
+/* Handle the 'P' individual regster set */
+static void gdb_cmd_reg_set(struct kgdb_state *ks)
+{
+ unsigned long regnum;
+ char *ptr = &remcom_in_buffer[1];
+ int i = 0;
+
+ kgdb_hex2long(&ptr, &regnum);
+ if (*ptr++ != '=' ||
+ !(!kgdb_usethread || kgdb_usethread == current) ||
+ !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ memset(gdb_regs, 0, sizeof(gdb_regs));
+ while (i < sizeof(gdb_regs) * 2)
+ if (hex_to_bin(ptr[i]) >= 0)
+ i++;
+ else
+ break;
+ i = i / 2;
+ kgdb_hex2mem(ptr, (char *)gdb_regs, i);
+ dbg_set_reg(regnum, gdb_regs, ks->linux_regs);
+ strcpy(remcom_out_buffer, "OK");
+}
+#endif /* DBG_MAX_REG_NUM > 0 */
+
/* Handle the 'X' memory binary write bytes */
static void gdb_cmd_binwrite(struct kgdb_state *ks)
{
@@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
{
struct task_struct *g;
struct task_struct *p;
- unsigned char thref[8];
+ unsigned char thref[BUF_THREAD_ID_SIZE];
char *ptr;
int i;
int cpu;
@@ -632,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
for_each_online_cpu(cpu) {
ks->thr_query = 0;
int_to_threadref(thref, -cpu - 2);
- pack_threadid(ptr, thref);
- ptr += BUF_THREAD_ID_SIZE;
+ ptr = pack_threadid(ptr, thref);
*(ptr++) = ',';
i++;
}
@@ -642,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
do_each_thread(g, p) {
if (i >= ks->thr_query && !finished) {
int_to_threadref(thref, p->pid);
- pack_threadid(ptr, thref);
- ptr += BUF_THREAD_ID_SIZE;
+ ptr = pack_threadid(ptr, thref);
*(ptr++) = ',';
ks->thr_query++;
if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
@@ -858,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
int error = 0;
int tmp;
- /* Clear the out buffer. */
+ /* Initialize comm buffer and globals. */
memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
if (kgdb_connected) {
- unsigned char thref[8];
+ unsigned char thref[BUF_THREAD_ID_SIZE];
char *ptr;
/* Reply to host that an exception has occurred */
@@ -876,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks)
put_packet(remcom_out_buffer);
}
- kgdb_usethread = kgdb_info[ks->cpu].task;
- ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
- ks->pass_exception = 0;
-
while (1) {
error = 0;
@@ -904,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
gdb_cmd_memwrite(ks);
break;
+#if DBG_MAX_REG_NUM > 0
+ case 'p': /* pXX Return gdb register XX (in hex) */
+ gdb_cmd_reg_get(ks);
+ break;
+ case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */
+ gdb_cmd_reg_set(ks);
+ break;
+#endif /* DBG_MAX_REG_NUM > 0 */
case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
gdb_cmd_binwrite(ks);
break;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ebe4a287419e..28b844118bbd 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value)
if (endp == arg) {
/*
- * Try base 16, for us folks too lazy to type the
+ * Also try base 16, for us folks too lazy to type the
* leading 0x...
*/
val = simple_strtoul(arg, &endp, 16);
@@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value)
return 0;
}
+int kdbgetu64arg(const char *arg, u64 *value)
+{
+ char *endp;
+ u64 val;
+
+ val = simple_strtoull(arg, &endp, 0);
+
+ if (endp == arg) {
+
+ val = simple_strtoull(arg, &endp, 16);
+ if (endp == arg)
+ return KDB_BADINT;
+ }
+
+ *value = val;
+
+ return 0;
+}
+
/*
* kdb_set - This function implements the 'set' command. Alter an
* existing environment variable or create a new one.
@@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv)
*/
static int kdb_rd(int argc, const char **argv)
{
- int diag = kdb_check_regs();
- if (diag)
- return diag;
+ int len = kdb_check_regs();
+#if DBG_MAX_REG_NUM > 0
+ int i;
+ char *rname;
+ int rsize;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
+
+ if (len)
+ return len;
+
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ rsize = dbg_reg_def[i].size * 2;
+ if (rsize > 16)
+ rsize = 2;
+ if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) {
+ len = 0;
+ kdb_printf("\n");
+ }
+ if (len)
+ len += kdb_printf(" ");
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ rname = dbg_get_reg(i, &reg8, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %02x", rname, reg8);
+ break;
+ case 16:
+ rname = dbg_get_reg(i, &reg16, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %04x", rname, reg16);
+ break;
+ case 32:
+ rname = dbg_get_reg(i, &reg32, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %08x", rname, reg32);
+ break;
+ case 64:
+ rname = dbg_get_reg(i, &reg64, kdb_current_regs);
+ if (!rname)
+ break;
+ len += kdb_printf("%s: %016llx", rname, reg64);
+ break;
+ default:
+ len += kdb_printf("%s: ??", dbg_reg_def[i].name);
+ }
+ }
+ kdb_printf("\n");
+#else
+ if (len)
+ return len;
kdb_dumpregs(kdb_current_regs);
+#endif
return 0;
}
@@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv)
* kdb_rm - This function implements the 'rm' (register modify) command.
* rm register-name new-contents
* Remarks:
- * Currently doesn't allow modification of control or
- * debug registers.
+ * Allows register modification with the same restrictions as gdb
*/
static int kdb_rm(int argc, const char **argv)
{
+#if DBG_MAX_REG_NUM > 0
int diag;
- int ind = 0;
- unsigned long contents;
+ const char *rname;
+ int i;
+ u64 reg64;
+ u32 reg32;
+ u16 reg16;
+ u8 reg8;
if (argc != 2)
return KDB_ARGCOUNT;
/*
* Allow presence or absence of leading '%' symbol.
*/
- if (argv[1][0] == '%')
- ind = 1;
+ rname = argv[1];
+ if (*rname == '%')
+ rname++;
- diag = kdbgetularg(argv[2], &contents);
+ diag = kdbgetu64arg(argv[2], &reg64);
if (diag)
return diag;
diag = kdb_check_regs();
if (diag)
return diag;
+
+ diag = KDB_BADREG;
+ for (i = 0; i < DBG_MAX_REG_NUM; i++) {
+ if (strcmp(rname, dbg_reg_def[i].name) == 0) {
+ diag = 0;
+ break;
+ }
+ }
+ if (!diag) {
+ switch(dbg_reg_def[i].size * 8) {
+ case 8:
+ reg8 = reg64;
+ dbg_set_reg(i, &reg8, kdb_current_regs);
+ break;
+ case 16:
+ reg16 = reg64;
+ dbg_set_reg(i, &reg16, kdb_current_regs);
+ break;
+ case 32:
+ reg32 = reg64;
+ dbg_set_reg(i, &reg32, kdb_current_regs);
+ break;
+ case 64:
+ dbg_set_reg(i, &reg64, kdb_current_regs);
+ break;
+ }
+ }
+ return diag;
+#else
kdb_printf("ERROR: Register set currently not implemented\n");
- return 0;
+ return 0;
+#endif
}
#if defined(CONFIG_MAGIC_SYSRQ)
@@ -2440,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val)
*/
static int kdb_summary(int argc, const char **argv)
{
+ struct timespec now;
struct kdb_tm tm;
struct sysinfo val;
@@ -2454,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
kdb_printf("ccversion %s\n", __stringify(CCVERSION));
- kdb_gmtime(&xtime, &tm);
+ now = __current_kernel_time();
+ kdb_gmtime(&now, &tm);
kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
"tz_minuteswest %d\n",
1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 97d3ba69775d..c438f545a321 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
extern int kdb_putword(unsigned long, unsigned long, size_t);
extern int kdbgetularg(const char *, unsigned long *);
-extern int kdb_set(int, const char **);
extern char *kdbgetenv(const char *);
-extern int kdbgetintenv(const char *, int *);
extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
extern int kdbgetsymval(const char *, kdb_symtab_t *);
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index dd62f8e714ca..0dbeae374225 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -134,23 +134,14 @@ unregister:
return 0;
}
-int
-__set_personality(unsigned int personality)
+int __set_personality(unsigned int personality)
{
- struct exec_domain *ep, *oep;
-
- ep = lookup_exec_domain(personality);
- if (ep == current_thread_info()->exec_domain) {
- current->personality = personality;
- module_put(ep->module);
- return 0;
- }
+ struct exec_domain *oep = current_thread_info()->exec_domain;
+ current_thread_info()->exec_domain = lookup_exec_domain(personality);
current->personality = personality;
- oep = current_thread_info()->exec_domain;
- current_thread_info()->exec_domain = ep;
-
module_put(oep->module);
+
return 0;
}
@@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
{
unsigned int old = current->personality;
- if (personality != 0xffffffff) {
+ if (personality != 0xffffffff)
set_personality(personality);
- if (current->personality != personality)
- return -EINVAL;
- }
return old;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67b564a..671ed56e0a49 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -771,9 +771,12 @@ static void forget_original_parent(struct task_struct *father)
struct task_struct *p, *n, *reaper;
LIST_HEAD(dead_children);
- exit_ptrace(father);
-
write_lock_irq(&tasklist_lock);
+ /*
+ * Note that exit_ptrace() and find_new_reaper() might
+ * drop tasklist_lock and reacquire it.
+ */
+ exit_ptrace(father);
reaper = find_new_reaper(father);
list_for_each_entry_safe(p, n, &father->children, sibling) {
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..98b450876f93 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sig->oom_adj = current->signal->oom_adj;
+ sig->oom_score_adj = current->signal->oom_score_adj;
return 0;
}
@@ -907,7 +908,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
unsigned long new_flags = p->flags;
- new_flags &= ~PF_SUPERPRIV;
+ new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
new_flags |= PF_FORKNOEXEC;
new_flags |= PF_STARTING;
p->flags = new_flags;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..ce669174f355 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
do {
seq = read_seqbegin(&xtime_lock);
xts = __current_kernel_time();
- tom = wall_to_monotonic;
+ tom = __get_wall_to_monotonic();
} while (read_seqretry(&xtime_lock, seq));
xtim = timespec_to_ktime(xts);
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
static int hrtimer_get_target(int this_cpu, int pinned)
{
#ifdef CONFIG_NO_HZ
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- return preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+ return get_nohz_timer_target();
#endif
return this_cpu;
}
@@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base;
- struct timespec realtime_offset;
+ struct timespec realtime_offset, wtm;
unsigned long seq;
if (!hrtimer_hres_active())
@@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg)
do {
seq = read_seqbegin(&xtime_lock);
- set_normalized_timespec(&realtime_offset,
- -wall_to_monotonic.tv_sec,
- -wall_to_monotonic.tv_nsec);
+ wtm = __get_wall_to_monotonic();
} while (read_seqretry(&xtime_lock, seq));
+ set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
base = &__get_cpu_var(hrtimer_bases);
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index e34d94d50924..d71a987fd2bf 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -242,6 +242,17 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
}
/*
+ * Function to perform processor-specific cleanup during unregistration
+ */
+__weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
+{
+ /*
+ * A weak stub function here for those archs that don't define
+ * it inside arch/.../kernel/hw_breakpoint.c
+ */
+}
+
+/*
* Contraints to check before allowing this new breakpoint counter:
*
* == Non-pinned counter == (Considered as pinned for now)
@@ -343,6 +354,7 @@ void release_bp_slot(struct perf_event *bp)
{
mutex_lock(&nr_bp_mutex);
+ arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp);
mutex_unlock(&nr_bp_mutex);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..c3003e9d91a3 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
{
if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_TIMER))
+ if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
return;
desc->status |= IRQ_SUSPENDED;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 131b1703936f..c0613f7d6730 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
image->nr_segments = nr_segments;
segment_bytes = nr_segments * sizeof(*segments);
result = copy_from_user(image->segment, segments, segment_bytes);
- if (result)
+ if (result) {
+ result = -EFAULT;
goto out;
+ }
/*
* Verify we have good destination addresses. The caller is
@@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image,
result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
if (result) {
- result = (result < 0) ? result : -EIO;
+ result = -EFAULT;
goto out;
}
ubytes -= uchunk;
@@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image,
kexec_flush_icache_page(page);
kunmap(page);
if (result) {
- result = (result < 0) ? result : -EIO;
+ result = -EFAULT;
goto out;
}
ubytes -= uchunk;
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 35edbe22e9a9..4502604ecadf 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,8 +1,7 @@
/*
- * A generic kernel FIFO implementation.
+ * A generic kernel FIFO implementation
*
- * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
- * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
+ * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -11,7 +10,7 @@
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
@@ -24,422 +23,579 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/kfifo.h>
#include <linux/log2.h>
#include <linux/uaccess.h>
+#include <linux/kfifo.h>
-static void _kfifo_init(struct kfifo *fifo, void *buffer,
- unsigned int size)
-{
- fifo->buffer = buffer;
- fifo->size = size;
-
- kfifo_reset(fifo);
-}
-
-/**
- * kfifo_init - initialize a FIFO using a preallocated buffer
- * @fifo: the fifo to assign the buffer
- * @buffer: the preallocated buffer to be used.
- * @size: the size of the internal buffer, this has to be a power of 2.
- *
+/*
+ * internal helper to calculate the unused elements in a fifo
*/
-void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
+static inline unsigned int kfifo_unused(struct __kfifo *fifo)
{
- /* size must be a power of 2 */
- BUG_ON(!is_power_of_2(size));
-
- _kfifo_init(fifo, buffer, size);
+ return (fifo->mask + 1) - (fifo->in - fifo->out);
}
-EXPORT_SYMBOL(kfifo_init);
-/**
- * kfifo_alloc - allocates a new FIFO internal buffer
- * @fifo: the fifo to assign then new buffer
- * @size: the size of the buffer to be allocated, this have to be a power of 2.
- * @gfp_mask: get_free_pages mask, passed to kmalloc()
- *
- * This function dynamically allocates a new fifo internal buffer
- *
- * The size will be rounded-up to a power of 2.
- * The buffer will be release with kfifo_free().
- * Return 0 if no error, otherwise the an error code
- */
-int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
+int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
+ size_t esize, gfp_t gfp_mask)
{
- unsigned char *buffer;
-
/*
- * round up to the next power of 2, since our 'let the indices
+ * round down to the next power of 2, since our 'let the indices
* wrap' technique works only in this case.
*/
- if (!is_power_of_2(size)) {
- BUG_ON(size > 0x80000000);
- size = roundup_pow_of_two(size);
+ if (!is_power_of_2(size))
+ size = rounddown_pow_of_two(size);
+
+ fifo->in = 0;
+ fifo->out = 0;
+ fifo->esize = esize;
+
+ if (size < 2) {
+ fifo->data = NULL;
+ fifo->mask = 0;
+ return -EINVAL;
}
- buffer = kmalloc(size, gfp_mask);
- if (!buffer) {
- _kfifo_init(fifo, NULL, 0);
+ fifo->data = kmalloc(size * esize, gfp_mask);
+
+ if (!fifo->data) {
+ fifo->mask = 0;
return -ENOMEM;
}
-
- _kfifo_init(fifo, buffer, size);
+ fifo->mask = size - 1;
return 0;
}
-EXPORT_SYMBOL(kfifo_alloc);
+EXPORT_SYMBOL(__kfifo_alloc);
-/**
- * kfifo_free - frees the FIFO internal buffer
- * @fifo: the fifo to be freed.
- */
-void kfifo_free(struct kfifo *fifo)
+void __kfifo_free(struct __kfifo *fifo)
{
- kfree(fifo->buffer);
- _kfifo_init(fifo, NULL, 0);
+ kfree(fifo->data);
+ fifo->in = 0;
+ fifo->out = 0;
+ fifo->esize = 0;
+ fifo->data = NULL;
+ fifo->mask = 0;
}
-EXPORT_SYMBOL(kfifo_free);
+EXPORT_SYMBOL(__kfifo_free);
-/**
- * kfifo_skip - skip output data
- * @fifo: the fifo to be used.
- * @len: number of bytes to skip
- */
-void kfifo_skip(struct kfifo *fifo, unsigned int len)
+int __kfifo_init(struct __kfifo *fifo, void *buffer,
+ unsigned int size, size_t esize)
{
- if (len < kfifo_len(fifo)) {
- __kfifo_add_out(fifo, len);
- return;
+ size /= esize;
+
+ if (!is_power_of_2(size))
+ size = rounddown_pow_of_two(size);
+
+ fifo->in = 0;
+ fifo->out = 0;
+ fifo->esize = esize;
+ fifo->data = buffer;
+
+ if (size < 2) {
+ fifo->mask = 0;
+ return -EINVAL;
}
- kfifo_reset_out(fifo);
+ fifo->mask = size - 1;
+
+ return 0;
}
-EXPORT_SYMBOL(kfifo_skip);
+EXPORT_SYMBOL(__kfifo_init);
-static inline void __kfifo_in_data(struct kfifo *fifo,
- const void *from, unsigned int len, unsigned int off)
+static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
+ unsigned int len, unsigned int off)
{
+ unsigned int size = fifo->mask + 1;
+ unsigned int esize = fifo->esize;
unsigned int l;
+ off &= fifo->mask;
+ if (esize != 1) {
+ off *= esize;
+ size *= esize;
+ len *= esize;
+ }
+ l = min(len, size - off);
+
+ memcpy(fifo->data + off, src, l);
+ memcpy(fifo->data, src + l, len - l);
/*
- * Ensure that we sample the fifo->out index -before- we
- * start putting bytes into the kfifo.
+ * make sure that the data in the fifo is up to date before
+ * incrementing the fifo->in index counter
*/
+ smp_wmb();
+}
- smp_mb();
-
- off = __kfifo_off(fifo, fifo->in + off);
+unsigned int __kfifo_in(struct __kfifo *fifo,
+ const void *buf, unsigned int len)
+{
+ unsigned int l;
- /* first put the data starting from fifo->in to buffer end */
- l = min(len, fifo->size - off);
- memcpy(fifo->buffer + off, from, l);
+ l = kfifo_unused(fifo);
+ if (len > l)
+ len = l;
- /* then put the rest (if any) at the beginning of the buffer */
- memcpy(fifo->buffer, from + l, len - l);
+ kfifo_copy_in(fifo, buf, len, fifo->in);
+ fifo->in += len;
+ return len;
}
+EXPORT_SYMBOL(__kfifo_in);
-static inline void __kfifo_out_data(struct kfifo *fifo,
- void *to, unsigned int len, unsigned int off)
+static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
+ unsigned int len, unsigned int off)
{
+ unsigned int size = fifo->mask + 1;
+ unsigned int esize = fifo->esize;
unsigned int l;
+ off &= fifo->mask;
+ if (esize != 1) {
+ off *= esize;
+ size *= esize;
+ len *= esize;
+ }
+ l = min(len, size - off);
+
+ memcpy(dst, fifo->data + off, l);
+ memcpy(dst + l, fifo->data, len - l);
/*
- * Ensure that we sample the fifo->in index -before- we
- * start removing bytes from the kfifo.
+ * make sure that the data is copied before
+ * incrementing the fifo->out index counter
*/
+ smp_wmb();
+}
- smp_rmb();
+unsigned int __kfifo_out_peek(struct __kfifo *fifo,
+ void *buf, unsigned int len)
+{
+ unsigned int l;
- off = __kfifo_off(fifo, fifo->out + off);
+ l = fifo->in - fifo->out;
+ if (len > l)
+ len = l;
- /* first get the data from fifo->out until the end of the buffer */
- l = min(len, fifo->size - off);
- memcpy(to, fifo->buffer + off, l);
+ kfifo_copy_out(fifo, buf, len, fifo->out);
+ return len;
+}
+EXPORT_SYMBOL(__kfifo_out_peek);
- /* then get the rest (if any) from the beginning of the buffer */
- memcpy(to + l, fifo->buffer, len - l);
+unsigned int __kfifo_out(struct __kfifo *fifo,
+ void *buf, unsigned int len)
+{
+ len = __kfifo_out_peek(fifo, buf, len);
+ fifo->out += len;
+ return len;
}
+EXPORT_SYMBOL(__kfifo_out);
-static inline int __kfifo_from_user_data(struct kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int off,
- unsigned *lenout)
+static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
+ const void __user *from, unsigned int len, unsigned int off,
+ unsigned int *copied)
{
+ unsigned int size = fifo->mask + 1;
+ unsigned int esize = fifo->esize;
unsigned int l;
- int ret;
+ unsigned long ret;
+ off &= fifo->mask;
+ if (esize != 1) {
+ off *= esize;
+ size *= esize;
+ len *= esize;
+ }
+ l = min(len, size - off);
+
+ ret = copy_from_user(fifo->data + off, from, l);
+ if (unlikely(ret))
+ ret = DIV_ROUND_UP(ret + len - l, esize);
+ else {
+ ret = copy_from_user(fifo->data, from + l, len - l);
+ if (unlikely(ret))
+ ret = DIV_ROUND_UP(ret, esize);
+ }
/*
- * Ensure that we sample the fifo->out index -before- we
- * start putting bytes into the kfifo.
+ * make sure that the data in the fifo is up to date before
+ * incrementing the fifo->in index counter
*/
+ smp_wmb();
+ *copied = len - ret;
+ /* return the number of elements which are not copied */
+ return ret;
+}
- smp_mb();
+int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
+ unsigned long len, unsigned int *copied)
+{
+ unsigned int l;
+ unsigned long ret;
+ unsigned int esize = fifo->esize;
+ int err;
- off = __kfifo_off(fifo, fifo->in + off);
+ if (esize != 1)
+ len /= esize;
- /* first put the data starting from fifo->in to buffer end */
- l = min(len, fifo->size - off);
- ret = copy_from_user(fifo->buffer + off, from, l);
- if (unlikely(ret)) {
- *lenout = ret;
- return -EFAULT;
- }
- *lenout = l;
+ l = kfifo_unused(fifo);
+ if (len > l)
+ len = l;
- /* then put the rest (if any) at the beginning of the buffer */
- ret = copy_from_user(fifo->buffer, from + l, len - l);
- *lenout += ret ? ret : len - l;
- return ret ? -EFAULT : 0;
+ ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
+ if (unlikely(ret)) {
+ len -= ret;
+ err = -EFAULT;
+ } else
+ err = 0;
+ fifo->in += len;
+ return err;
}
+EXPORT_SYMBOL(__kfifo_from_user);
-static inline int __kfifo_to_user_data(struct kfifo *fifo,
- void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
+static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
+ unsigned int len, unsigned int off, unsigned int *copied)
{
unsigned int l;
- int ret;
-
+ unsigned long ret;
+ unsigned int size = fifo->mask + 1;
+ unsigned int esize = fifo->esize;
+
+ off &= fifo->mask;
+ if (esize != 1) {
+ off *= esize;
+ size *= esize;
+ len *= esize;
+ }
+ l = min(len, size - off);
+
+ ret = copy_to_user(to, fifo->data + off, l);
+ if (unlikely(ret))
+ ret = DIV_ROUND_UP(ret + len - l, esize);
+ else {
+ ret = copy_to_user(to + l, fifo->data, len - l);
+ if (unlikely(ret))
+ ret = DIV_ROUND_UP(ret, esize);
+ }
/*
- * Ensure that we sample the fifo->in index -before- we
- * start removing bytes from the kfifo.
+ * make sure that the data is copied before
+ * incrementing the fifo->out index counter
*/
+ smp_wmb();
+ *copied = len - ret;
+ /* return the number of elements which are not copied */
+ return ret;
+}
- smp_rmb();
+int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
+ unsigned long len, unsigned int *copied)
+{
+ unsigned int l;
+ unsigned long ret;
+ unsigned int esize = fifo->esize;
+ int err;
- off = __kfifo_off(fifo, fifo->out + off);
+ if (esize != 1)
+ len /= esize;
- /* first get the data from fifo->out until the end of the buffer */
- l = min(len, fifo->size - off);
- ret = copy_to_user(to, fifo->buffer + off, l);
- *lenout = l;
+ l = fifo->in - fifo->out;
+ if (len > l)
+ len = l;
+ ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
if (unlikely(ret)) {
- *lenout -= ret;
- return -EFAULT;
- }
+ len -= ret;
+ err = -EFAULT;
+ } else
+ err = 0;
+ fifo->out += len;
+ return err;
+}
+EXPORT_SYMBOL(__kfifo_to_user);
- /* then get the rest (if any) from the beginning of the buffer */
- len -= l;
- ret = copy_to_user(to + l, fifo->buffer, len);
- if (unlikely(ret)) {
- *lenout += len - ret;
- return -EFAULT;
+static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
+ int nents, unsigned int len)
+{
+ int n;
+ unsigned int l;
+ unsigned int off;
+ struct page *page;
+
+ if (!nents)
+ return 0;
+
+ if (!len)
+ return 0;
+
+ n = 0;
+ page = virt_to_page(buf);
+ off = offset_in_page(buf);
+ l = 0;
+
+ while (len >= l + PAGE_SIZE - off) {
+ struct page *npage;
+
+ l += PAGE_SIZE;
+ buf += PAGE_SIZE;
+ npage = virt_to_page(buf);
+ if (page_to_phys(page) != page_to_phys(npage) - l) {
+ sg_set_page(sgl, page, l - off, off);
+ sgl = sg_next(sgl);
+ if (++n == nents || sgl == NULL)
+ return n;
+ page = npage;
+ len -= l - off;
+ l = off = 0;
+ }
}
- *lenout += len;
- return 0;
+ sg_set_page(sgl, page, len, off);
+ return n + 1;
}
-unsigned int __kfifo_in_n(struct kfifo *fifo,
- const void *from, unsigned int len, unsigned int recsize)
+static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
+ int nents, unsigned int len, unsigned int off)
{
- if (kfifo_avail(fifo) < len + recsize)
- return len + 1;
+ unsigned int size = fifo->mask + 1;
+ unsigned int esize = fifo->esize;
+ unsigned int l;
+ unsigned int n;
- __kfifo_in_data(fifo, from, len, recsize);
- return 0;
+ off &= fifo->mask;
+ if (esize != 1) {
+ off *= esize;
+ size *= esize;
+ len *= esize;
+ }
+ l = min(len, size - off);
+
+ n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
+ n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
+
+ if (n)
+ sg_mark_end(sgl + n - 1);
+ return n;
}
-EXPORT_SYMBOL(__kfifo_in_n);
-/**
- * kfifo_in - puts some data into the FIFO
- * @fifo: the fifo to be used.
- * @from: the data to be added.
- * @len: the length of the data to be added.
- *
- * This function copies at most @len bytes from the @from buffer into
- * the FIFO depending on the free space, and returns the number of
- * bytes copied.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-unsigned int kfifo_in(struct kfifo *fifo, const void *from,
- unsigned int len)
+unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
+ struct scatterlist *sgl, int nents, unsigned int len)
{
- len = min(kfifo_avail(fifo), len);
+ unsigned int l;
- __kfifo_in_data(fifo, from, len, 0);
- __kfifo_add_in(fifo, len);
- return len;
+ l = kfifo_unused(fifo);
+ if (len > l)
+ len = l;
+
+ return setup_sgl(fifo, sgl, nents, len, fifo->in);
}
-EXPORT_SYMBOL(kfifo_in);
+EXPORT_SYMBOL(__kfifo_dma_in_prepare);
-unsigned int __kfifo_in_generic(struct kfifo *fifo,
- const void *from, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
+ struct scatterlist *sgl, int nents, unsigned int len)
{
- return __kfifo_in_rec(fifo, from, len, recsize);
+ unsigned int l;
+
+ l = fifo->in - fifo->out;
+ if (len > l)
+ len = l;
+
+ return setup_sgl(fifo, sgl, nents, len, fifo->out);
}
-EXPORT_SYMBOL(__kfifo_in_generic);
+EXPORT_SYMBOL(__kfifo_dma_out_prepare);
-unsigned int __kfifo_out_n(struct kfifo *fifo,
- void *to, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
{
- if (kfifo_len(fifo) < len + recsize)
- return len;
+ unsigned int max = (1 << (recsize << 3)) - 1;
- __kfifo_out_data(fifo, to, len, recsize);
- __kfifo_add_out(fifo, len + recsize);
- return 0;
+ if (len > max)
+ return max;
+ return len;
}
-EXPORT_SYMBOL(__kfifo_out_n);
-/**
- * kfifo_out - gets some data from the FIFO
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @len: the size of the destination buffer.
- *
- * This function copies at most @len bytes from the FIFO into the
- * @to buffer and returns the number of copied bytes.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
+#define __KFIFO_PEEK(data, out, mask) \
+ ((data)[(out) & (mask)])
+/*
+ * __kfifo_peek_n internal helper function for determinate the length of
+ * the next record in the fifo
*/
-unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
+static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
{
- len = min(kfifo_len(fifo), len);
+ unsigned int l;
+ unsigned int mask = fifo->mask;
+ unsigned char *data = fifo->data;
- __kfifo_out_data(fifo, to, len, 0);
- __kfifo_add_out(fifo, len);
+ l = __KFIFO_PEEK(data, fifo->out, mask);
- return len;
+ if (--recsize)
+ l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
+
+ return l;
}
-EXPORT_SYMBOL(kfifo_out);
-
-/**
- * kfifo_out_peek - copy some data from the FIFO, but do not remove it
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @len: the size of the destination buffer.
- * @offset: offset into the fifo
- *
- * This function copies at most @len bytes at @offset from the FIFO
- * into the @to buffer and returns the number of copied bytes.
- * The data is not removed from the FIFO.
+
+#define __KFIFO_POKE(data, in, mask, val) \
+ ( \
+ (data)[(in) & (mask)] = (unsigned char)(val) \
+ )
+
+/*
+ * __kfifo_poke_n internal helper function for storeing the length of
+ * the record into the fifo
*/
-unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
- unsigned offset)
+static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
{
- len = min(kfifo_len(fifo), len + offset);
+ unsigned int mask = fifo->mask;
+ unsigned char *data = fifo->data;
- __kfifo_out_data(fifo, to, len, offset);
- return len;
+ __KFIFO_POKE(data, fifo->in, mask, n);
+
+ if (recsize > 1)
+ __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
}
-EXPORT_SYMBOL(kfifo_out_peek);
-unsigned int __kfifo_out_generic(struct kfifo *fifo,
- void *to, unsigned int len, unsigned int recsize,
- unsigned int *total)
+unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
{
- return __kfifo_out_rec(fifo, to, len, recsize, total);
+ return __kfifo_peek_n(fifo, recsize);
}
-EXPORT_SYMBOL(__kfifo_out_generic);
+EXPORT_SYMBOL(__kfifo_len_r);
-unsigned int __kfifo_from_user_n(struct kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
+ unsigned int len, size_t recsize)
{
- unsigned total;
+ if (len + recsize > kfifo_unused(fifo))
+ return 0;
- if (kfifo_avail(fifo) < len + recsize)
- return len + 1;
+ __kfifo_poke_n(fifo, len, recsize);
- __kfifo_from_user_data(fifo, from, len, recsize, &total);
- return total;
+ kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
+ fifo->in += len + recsize;
+ return len;
}
-EXPORT_SYMBOL(__kfifo_from_user_n);
-
-/**
- * kfifo_from_user - puts some data from user space into the FIFO
- * @fifo: the fifo to be used.
- * @from: pointer to the data to be added.
- * @len: the length of the data to be added.
- * @total: the actual returned data length.
- *
- * This function copies at most @len bytes from the @from into the
- * FIFO depending and returns -EFAULT/0.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-int kfifo_from_user(struct kfifo *fifo,
- const void __user *from, unsigned int len, unsigned *total)
+EXPORT_SYMBOL(__kfifo_in_r);
+
+static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
+ void *buf, unsigned int len, size_t recsize, unsigned int *n)
{
- int ret;
- len = min(kfifo_avail(fifo), len);
- ret = __kfifo_from_user_data(fifo, from, len, 0, total);
- if (ret)
- return ret;
- __kfifo_add_in(fifo, len);
- return 0;
+ *n = __kfifo_peek_n(fifo, recsize);
+
+ if (len > *n)
+ len = *n;
+
+ kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
+ return len;
+}
+
+unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
+ unsigned int len, size_t recsize)
+{
+ unsigned int n;
+
+ if (fifo->in == fifo->out)
+ return 0;
+
+ return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
}
-EXPORT_SYMBOL(kfifo_from_user);
+EXPORT_SYMBOL(__kfifo_out_peek_r);
-unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int recsize)
+unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
+ unsigned int len, size_t recsize)
{
- return __kfifo_from_user_rec(fifo, from, len, recsize);
+ unsigned int n;
+
+ if (fifo->in == fifo->out)
+ return 0;
+
+ len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
+ fifo->out += n + recsize;
+ return len;
}
-EXPORT_SYMBOL(__kfifo_from_user_generic);
+EXPORT_SYMBOL(__kfifo_out_r);
-unsigned int __kfifo_to_user_n(struct kfifo *fifo,
- void __user *to, unsigned int len, unsigned int reclen,
- unsigned int recsize)
+int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
+ unsigned long len, unsigned int *copied, size_t recsize)
{
- unsigned int ret, total;
+ unsigned long ret;
- if (kfifo_len(fifo) < reclen + recsize)
- return len;
+ len = __kfifo_max_r(len, recsize);
- ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
+ if (len + recsize > kfifo_unused(fifo)) {
+ *copied = 0;
+ return 0;
+ }
- if (likely(ret == 0))
- __kfifo_add_out(fifo, reclen + recsize);
+ __kfifo_poke_n(fifo, len, recsize);
- return total;
+ ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
+ if (unlikely(ret)) {
+ *copied = 0;
+ return -EFAULT;
+ }
+ fifo->in += len + recsize;
+ return 0;
}
-EXPORT_SYMBOL(__kfifo_to_user_n);
-
-/**
- * kfifo_to_user - gets data from the FIFO and write it to user space
- * @fifo: the fifo to be used.
- * @to: where the data must be copied.
- * @len: the size of the destination buffer.
- * @lenout: pointer to output variable with copied data
- *
- * This function copies at most @len bytes from the FIFO into the
- * @to buffer and 0 or -EFAULT.
- *
- * Note that with only one concurrent reader and one concurrent
- * writer, you don't need extra locking to use these functions.
- */
-int kfifo_to_user(struct kfifo *fifo,
- void __user *to, unsigned int len, unsigned *lenout)
+EXPORT_SYMBOL(__kfifo_from_user_r);
+
+int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
+ unsigned long len, unsigned int *copied, size_t recsize)
{
- int ret;
- len = min(kfifo_len(fifo), len);
- ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
- __kfifo_add_out(fifo, *lenout);
- return ret;
+ unsigned long ret;
+ unsigned int n;
+
+ if (fifo->in == fifo->out) {
+ *copied = 0;
+ return 0;
+ }
+
+ n = __kfifo_peek_n(fifo, recsize);
+ if (len > n)
+ len = n;
+
+ ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
+ if (unlikely(ret)) {
+ *copied = 0;
+ return -EFAULT;
+ }
+ fifo->out += n + recsize;
+ return 0;
}
-EXPORT_SYMBOL(kfifo_to_user);
+EXPORT_SYMBOL(__kfifo_to_user_r);
-unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
- void __user *to, unsigned int len, unsigned int recsize,
- unsigned int *total)
+unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
+ struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
{
- return __kfifo_to_user_rec(fifo, to, len, recsize, total);
+ if (!nents)
+ BUG();
+
+ len = __kfifo_max_r(len, recsize);
+
+ if (len + recsize > kfifo_unused(fifo))
+ return 0;
+
+ return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
}
-EXPORT_SYMBOL(__kfifo_to_user_generic);
+EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
-unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
+void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
+ unsigned int len, size_t recsize)
{
- if (recsize == 0)
- return kfifo_avail(fifo);
-
- return __kfifo_peek_n(fifo, recsize);
+ len = __kfifo_max_r(len, recsize);
+ __kfifo_poke_n(fifo, len, recsize);
+ fifo->in += len + recsize;
}
-EXPORT_SYMBOL(__kfifo_peek_generic);
+EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
-void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
+unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
+ struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
{
- __kfifo_skip_rec(fifo, recsize);
+ if (!nents)
+ BUG();
+
+ len = __kfifo_max_r(len, recsize);
+
+ if (len + recsize > fifo->in - fifo->out)
+ return 0;
+
+ return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
}
-EXPORT_SYMBOL(__kfifo_skip_generic);
+EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
+
+void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
+{
+ unsigned int len;
+ len = __kfifo_peek_n(fifo, recsize);
+ fifo->out += len + recsize;
+}
+EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 83911c780175..2dc3786349d1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,8 @@
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/freezer.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -35,6 +37,7 @@ struct kthread_create_info
struct kthread {
int should_stop;
+ void *data;
struct completion exited;
};
@@ -54,6 +57,19 @@ int kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+/**
+ * kthread_data - return data value specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Return the data value specified when kthread @task was created.
+ * The caller is responsible for ensuring the validity of @task when
+ * calling this function.
+ */
+void *kthread_data(struct task_struct *task)
+{
+ return to_kthread(task)->data;
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -64,6 +80,7 @@ static int kthread(void *_create)
int ret;
self.should_stop = 0;
+ self.data = data;
init_completion(&self.exited);
current->vfork_done = &self.exited;
@@ -247,3 +264,150 @@ int kthreadd(void *unused)
return 0;
}
+
+/**
+ * kthread_worker_fn - kthread function to process kthread_worker
+ * @worker_ptr: pointer to initialized kthread_worker
+ *
+ * This function can be used as @threadfn to kthread_create() or
+ * kthread_run() with @worker_ptr argument pointing to an initialized
+ * kthread_worker. The started kthread will process work_list until
+ * the it is stopped with kthread_stop(). A kthread can also call
+ * this function directly after extra initialization.
+ *
+ * Different kthreads can be used for the same kthread_worker as long
+ * as there's only one kthread attached to it at any given time. A
+ * kthread_worker without an attached kthread simply collects queued
+ * kthread_works.
+ */
+int kthread_worker_fn(void *worker_ptr)
+{
+ struct kthread_worker *worker = worker_ptr;
+ struct kthread_work *work;
+
+ WARN_ON(worker->task);
+ worker->task = current;
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irq(&worker->lock);
+ worker->task = NULL;
+ spin_unlock_irq(&worker->lock);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock_irq(&worker->lock);
+ if (!list_empty(&worker->work_list)) {
+ work = list_first_entry(&worker->work_list,
+ struct kthread_work, node);
+ list_del_init(&work->node);
+ }
+ spin_unlock_irq(&worker->lock);
+
+ if (work) {
+ __set_current_state(TASK_RUNNING);
+ work->func(work);
+ smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
+ work->done_seq = work->queue_seq;
+ smp_mb(); /* mb worker-b1 paired with flush-b0 */
+ if (atomic_read(&work->flushing))
+ wake_up_all(&work->done);
+ } else if (!freezing(current))
+ schedule();
+
+ try_to_freeze();
+ goto repeat;
+}
+EXPORT_SYMBOL_GPL(kthread_worker_fn);
+
+/**
+ * queue_kthread_work - queue a kthread_work
+ * @worker: target kthread_worker
+ * @work: kthread_work to queue
+ *
+ * Queue @work to work processor @task for async execution. @task
+ * must have been created with kthread_worker_create(). Returns %true
+ * if @work was successfully queued, %false if it was already pending.
+ */
+bool queue_kthread_work(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ bool ret = false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ if (list_empty(&work->node)) {
+ list_add_tail(&work->node, &worker->work_list);
+ work->queue_seq++;
+ if (likely(worker->task))
+ wake_up_process(worker->task);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_kthread_work);
+
+/**
+ * flush_kthread_work - flush a kthread_work
+ * @work: work to flush
+ *
+ * If @work is queued or executing, wait for it to finish execution.
+ */
+void flush_kthread_work(struct kthread_work *work)
+{
+ int seq = work->queue_seq;
+
+ atomic_inc(&work->flushing);
+
+ /*
+ * mb flush-b0 paired with worker-b1, to make sure either
+ * worker sees the above increment or we see done_seq update.
+ */
+ smp_mb__after_atomic_inc();
+
+ /* A - B <= 0 tests whether B is in front of A regardless of overflow */
+ wait_event(work->done, seq - work->done_seq <= 0);
+ atomic_dec(&work->flushing);
+
+ /*
+ * rmb flush-b1 paired with worker-b0, to make sure our caller
+ * sees every change made by work->func().
+ */
+ smp_mb__after_atomic_dec();
+}
+EXPORT_SYMBOL_GPL(flush_kthread_work);
+
+struct kthread_flush_work {
+ struct kthread_work work;
+ struct completion done;
+};
+
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+ struct kthread_flush_work *fwork =
+ container_of(work, struct kthread_flush_work, work);
+ complete(&fwork->done);
+}
+
+/**
+ * flush_kthread_worker - flush all current works on a kthread_worker
+ * @worker: worker to flush
+ *
+ * Wait until all currently executing or pending works on @worker are
+ * finished.
+ */
+void flush_kthread_worker(struct kthread_worker *worker)
+{
+ struct kthread_flush_work fwork = {
+ KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
+ COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+ };
+
+ queue_kthread_work(worker, &fwork.work);
+ wait_for_completion(&fwork.done);
+}
+EXPORT_SYMBOL_GPL(flush_kthread_worker);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
static inline u64 lockstat_clock(void)
{
- return cpu_clock(smp_processor_id());
+ return local_clock();
}
static int lock_point(unsigned long points[], unsigned long ip)
diff --git a/kernel/module.c b/kernel/module.c
index 6c562828c85c..d0b5f8db11b4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,6 +1,6 @@
/*
Copyright (C) 2002 Richard Henderson
- Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+ Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb)
}
EXPORT_SYMBOL(unregister_module_notifier);
+struct load_info {
+ Elf_Ehdr *hdr;
+ unsigned long len;
+ Elf_Shdr *sechdrs;
+ char *secstrings, *strtab;
+ unsigned long *strmap;
+ unsigned long symoffs, stroffs;
+ struct _ddebug *debug;
+ unsigned int num_debug;
+ struct {
+ unsigned int sym, str, mod, vers, info, pcpu;
+ } index;
+};
+
/* We require a truly strong try_module_get(): 0 means failure due to
ongoing or failed initialization etc. */
static inline int strong_try_module_get(struct module *mod)
@@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code)
EXPORT_SYMBOL(__module_put_and_exit);
/* Find a module section: 0 means not found. */
-static unsigned int find_sec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings,
- const char *name)
+static unsigned int find_sec(const struct load_info *info, const char *name)
{
unsigned int i;
- for (i = 1; i < hdr->e_shnum; i++)
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
/* Alloc bit cleared means "ignore it." */
- if ((sechdrs[i].sh_flags & SHF_ALLOC)
- && strcmp(secstrings+sechdrs[i].sh_name, name) == 0)
+ if ((shdr->sh_flags & SHF_ALLOC)
+ && strcmp(info->secstrings + shdr->sh_name, name) == 0)
return i;
+ }
return 0;
}
/* Find a module section, or NULL. */
-static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
- const char *secstrings, const char *name)
+static void *section_addr(const struct load_info *info, const char *name)
{
/* Section 0 has sh_addr 0. */
- return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
+ return (void *)info->sechdrs[find_sec(info, name)].sh_addr;
}
/* Find a module section, or NULL. Fill in number of "objects" in section. */
-static void *section_objs(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings,
+static void *section_objs(const struct load_info *info,
const char *name,
size_t object_size,
unsigned int *num)
{
- unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
+ unsigned int sec = find_sec(info, name);
/* Section 0 has sh_addr 0 and sh_size 0. */
- *num = sechdrs[sec].sh_size / object_size;
- return (void *)sechdrs[sec].sh_addr;
+ *num = info->sechdrs[sec].sh_size / object_size;
+ return (void *)info->sechdrs[sec].sh_addr;
}
/* Provided by the linker */
@@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
unsigned int symnum, void *data), void *data)
{
struct module *mod;
- const struct symsearch arr[] = {
+ static const struct symsearch arr[] = {
{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
NOT_GPL_ONLY, false },
{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
@@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod,
mod->percpu = __alloc_reserved_percpu(size, align);
if (!mod->percpu) {
printk(KERN_WARNING
- "Could not allocate %lu bytes percpu data\n", size);
+ "%s: Could not allocate %lu bytes percpu data\n",
+ mod->name, size);
return -ENOMEM;
}
mod->percpu_size = size;
@@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod)
free_percpu(mod->percpu);
}
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
+static unsigned int find_pcpusec(struct load_info *info)
{
- return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
+ return find_sec(info, ".data..percpu");
}
static void percpu_modcopy(struct module *mod,
@@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod,
static inline void percpu_modfree(struct module *mod)
{
}
-static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
+static unsigned int find_pcpusec(struct load_info *info)
{
return 0;
}
@@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
EXPORT_TRACEPOINT_SYMBOL(module_get);
/* Init the unload section of the module. */
-static void module_unload_init(struct module *mod)
+static int module_unload_init(struct module *mod)
{
- int cpu;
+ mod->refptr = alloc_percpu(struct module_ref);
+ if (!mod->refptr)
+ return -ENOMEM;
INIT_LIST_HEAD(&mod->source_list);
INIT_LIST_HEAD(&mod->target_list);
- for_each_possible_cpu(cpu) {
- per_cpu_ptr(mod->refptr, cpu)->incs = 0;
- per_cpu_ptr(mod->refptr, cpu)->decs = 0;
- }
/* Hold reference count during initialization. */
__this_cpu_write(mod->refptr->incs, 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
+
+ return 0;
}
/* Does a already use b? */
@@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod)
kfree(use);
}
mutex_unlock(&module_mutex);
+
+ free_percpu(mod->refptr);
}
#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -891,8 +900,9 @@ int ref_module(struct module *a, struct module *b)
}
EXPORT_SYMBOL_GPL(ref_module);
-static inline void module_unload_init(struct module *mod)
+static inline int module_unload_init(struct module *mod)
{
+ return 0;
}
#endif /* CONFIG_MODULE_UNLOAD */
@@ -1051,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic,
#endif /* CONFIG_MODVERSIONS */
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
-static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+ const struct load_info *info,
const char *name,
- struct module *mod,
char ownername[])
{
struct module *owner;
@@ -1068,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
if (!sym)
goto unlock;
- if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
+ if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
+ owner)) {
sym = ERR_PTR(-EINVAL);
goto getname;
}
@@ -1087,21 +1097,20 @@ unlock:
return sym;
}
-static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
- unsigned int versindex,
- const char *name,
- struct module *mod)
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+ const struct load_info *info,
+ const char *name)
{
const struct kernel_symbol *ksym;
- char ownername[MODULE_NAME_LEN];
+ char owner[MODULE_NAME_LEN];
if (wait_event_interruptible_timeout(module_wq,
- !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
- mod, ownername)) ||
- PTR_ERR(ksym) != -EBUSY,
+ !IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+ || PTR_ERR(ksym) != -EBUSY,
30 * HZ) <= 0) {
printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
- mod->name, ownername);
+ mod->name, owner);
}
return ksym;
}
@@ -1110,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
* /sys/module/foo/sections stuff
* J. Corbet <corbet@lwn.net>
*/
-#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
+#ifdef CONFIG_SYSFS
+#ifdef CONFIG_KALLSYMS
static inline bool sect_empty(const Elf_Shdr *sect)
{
return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
@@ -1148,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
kfree(sect_attrs);
}
-static void add_sect_attrs(struct module *mod, unsigned int nsect,
- char *secstrings, Elf_Shdr *sechdrs)
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
{
unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
@@ -1157,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
struct attribute **gattr;
/* Count loaded sections and allocate structures */
- for (i = 0; i < nsect; i++)
- if (!sect_empty(&sechdrs[i]))
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]))
nloaded++;
size[0] = ALIGN(sizeof(*sect_attrs)
+ nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1175,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
gattr = &sect_attrs->grp.attrs[0];
- for (i = 0; i < nsect; i++) {
- if (sect_empty(&sechdrs[i]))
+ for (i = 0; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *sec = &info->sechdrs[i];
+ if (sect_empty(sec))
continue;
- sattr->address = sechdrs[i].sh_addr;
- sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+ sattr->address = sec->sh_addr;
+ sattr->name = kstrdup(info->secstrings + sec->sh_name,
GFP_KERNEL);
if (sattr->name == NULL)
goto out;
@@ -1247,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
kfree(notes_attrs);
}
-static void add_notes_attrs(struct module *mod, unsigned int nsect,
- char *secstrings, Elf_Shdr *sechdrs)
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
{
unsigned int notes, loaded, i;
struct module_notes_attrs *notes_attrs;
@@ -1260,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
/* Count notes sections and allocate structures. */
notes = 0;
- for (i = 0; i < nsect; i++)
- if (!sect_empty(&sechdrs[i]) &&
- (sechdrs[i].sh_type == SHT_NOTE))
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ if (!sect_empty(&info->sechdrs[i]) &&
+ (info->sechdrs[i].sh_type == SHT_NOTE))
++notes;
if (notes == 0)
@@ -1276,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
notes_attrs->notes = notes;
nattr = &notes_attrs->attrs[0];
- for (loaded = i = 0; i < nsect; ++i) {
- if (sect_empty(&sechdrs[i]))
+ for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+ if (sect_empty(&info->sechdrs[i]))
continue;
- if (sechdrs[i].sh_type == SHT_NOTE) {
+ if (info->sechdrs[i].sh_type == SHT_NOTE) {
sysfs_bin_attr_init(nattr);
nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
nattr->attr.mode = S_IRUGO;
- nattr->size = sechdrs[i].sh_size;
- nattr->private = (void *) sechdrs[i].sh_addr;
+ nattr->size = info->sechdrs[i].sh_size;
+ nattr->private = (void *) info->sechdrs[i].sh_addr;
nattr->read = module_notes_read;
++nattr;
}
@@ -1315,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod)
#else
-static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
- char *sectstrings, Elf_Shdr *sechdrs)
+static inline void add_sect_attrs(struct module *mod,
+ const struct load_info *info)
{
}
@@ -1324,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod)
{
}
-static inline void add_notes_attrs(struct module *mod, unsigned int nsect,
- char *sectstrings, Elf_Shdr *sechdrs)
+static inline void add_notes_attrs(struct module *mod,
+ const struct load_info *info)
{
}
static inline void remove_notes_attrs(struct module *mod)
{
}
-#endif
+#endif /* CONFIG_KALLSYMS */
-#ifdef CONFIG_SYSFS
static void add_usage_links(struct module *mod)
{
#ifdef CONFIG_MODULE_UNLOAD
@@ -1439,6 +1447,7 @@ out:
}
static int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
struct kernel_param *kparam,
unsigned int num_params)
{
@@ -1463,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod,
goto out_unreg_param;
add_usage_links(mod);
+ add_sect_attrs(mod, info);
+ add_notes_attrs(mod, info);
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
@@ -1479,33 +1490,26 @@ out:
static void mod_sysfs_fini(struct module *mod)
{
+ remove_notes_attrs(mod);
+ remove_sect_attrs(mod);
kobject_put(&mod->mkobj.kobj);
}
-#else /* CONFIG_SYSFS */
-
-static inline int mod_sysfs_init(struct module *mod)
-{
- return 0;
-}
+#else /* !CONFIG_SYSFS */
-static inline int mod_sysfs_setup(struct module *mod,
+static int mod_sysfs_setup(struct module *mod,
+ const struct load_info *info,
struct kernel_param *kparam,
unsigned int num_params)
{
return 0;
}
-static inline int module_add_modinfo_attrs(struct module *mod)
-{
- return 0;
-}
-
-static inline void module_remove_modinfo_attrs(struct module *mod)
+static void mod_sysfs_fini(struct module *mod)
{
}
-static void mod_sysfs_fini(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod)
{
}
@@ -1515,7 +1519,7 @@ static void del_usage_links(struct module *mod)
#endif /* CONFIG_SYSFS */
-static void mod_kobject_remove(struct module *mod)
+static void mod_sysfs_teardown(struct module *mod)
{
del_usage_links(mod);
module_remove_modinfo_attrs(mod);
@@ -1545,9 +1549,7 @@ static void free_module(struct module *mod)
mutex_lock(&module_mutex);
stop_machine(__unlink_module, mod, NULL);
mutex_unlock(&module_mutex);
- remove_notes_attrs(mod);
- remove_sect_attrs(mod);
- mod_kobject_remove(mod);
+ mod_sysfs_teardown(mod);
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1565,10 +1567,7 @@ static void free_module(struct module *mod)
module_free(mod, mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
-#if defined(CONFIG_MODULE_UNLOAD)
- if (mod->refptr)
- free_percpu(mod->refptr);
-#endif
+
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -1634,25 +1633,23 @@ static int verify_export_symbols(struct module *mod)
}
/* Change all symbols so that st_value encodes the pointer directly. */
-static int simplify_symbols(Elf_Shdr *sechdrs,
- unsigned int symindex,
- const char *strtab,
- unsigned int versindex,
- unsigned int pcpuindex,
- struct module *mod)
-{
- Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
+static int simplify_symbols(struct module *mod, const struct load_info *info)
+{
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
+ Elf_Sym *sym = (void *)symsec->sh_addr;
unsigned long secbase;
- unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
+ unsigned int i;
int ret = 0;
const struct kernel_symbol *ksym;
- for (i = 1; i < n; i++) {
+ for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) {
+ const char *name = info->strtab + sym[i].st_name;
+
switch (sym[i].st_shndx) {
case SHN_COMMON:
/* We compiled with -fno-common. These are not
supposed to happen. */
- DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name);
+ DEBUGP("Common symbol: %s\n", name);
printk("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
@@ -1665,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
case SHN_UNDEF:
- ksym = resolve_symbol_wait(sechdrs, versindex,
- strtab + sym[i].st_name,
- mod);
+ ksym = resolve_symbol_wait(mod, info, name);
/* Ok if resolved. */
if (ksym && !IS_ERR(ksym)) {
sym[i].st_value = ksym->value;
@@ -1679,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
- mod->name, strtab + sym[i].st_name,
- PTR_ERR(ksym));
+ mod->name, name, PTR_ERR(ksym));
ret = PTR_ERR(ksym) ?: -ENOENT;
break;
default:
/* Divert to percpu allocation if a percpu var. */
- if (sym[i].st_shndx == pcpuindex)
+ if (sym[i].st_shndx == info->index.pcpu)
secbase = (unsigned long)mod_percpu(mod);
else
- secbase = sechdrs[sym[i].st_shndx].sh_addr;
+ secbase = info->sechdrs[sym[i].st_shndx].sh_addr;
sym[i].st_value += secbase;
break;
}
@@ -1698,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
return ret;
}
+static int apply_relocations(struct module *mod, const struct load_info *info)
+{
+ unsigned int i;
+ int err = 0;
+
+ /* Now do relocations. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ unsigned int infosec = info->sechdrs[i].sh_info;
+
+ /* Not a valid relocation section? */
+ if (infosec >= info->hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (info->sechdrs[i].sh_type == SHT_REL)
+ err = apply_relocate(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ else if (info->sechdrs[i].sh_type == SHT_RELA)
+ err = apply_relocate_add(info->sechdrs, info->strtab,
+ info->index.sym, i, mod);
+ if (err < 0)
+ break;
+ }
+ return err;
+}
+
/* Additional bytes needed by arch in front of individual sections */
unsigned int __weak arch_mod_section_prepend(struct module *mod,
unsigned int section)
@@ -1722,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size,
might -- code, read-only data, read-write data, small data. Tally
sizes, and place the offsets into sh_entsize fields: high bit means it
belongs in init. */
-static void layout_sections(struct module *mod,
- const Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs,
- const char *secstrings)
+static void layout_sections(struct module *mod, struct load_info *info)
{
static unsigned long const masks[][2] = {
/* NOTE: all executable code must be the first section
@@ -1738,21 +1758,22 @@ static void layout_sections(struct module *mod,
};
unsigned int m, i;
- for (i = 0; i < hdr->e_shnum; i++)
- sechdrs[i].sh_entsize = ~0UL;
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
DEBUGP("Core section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < hdr->e_shnum; ++i) {
- Elf_Shdr *s = &sechdrs[i];
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || strstarts(secstrings + s->sh_name, ".init"))
+ || strstarts(sname, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
- DEBUGP("\t%s\n", secstrings + s->sh_name);
+ DEBUGP("\t%s\n", name);
}
if (m == 0)
mod->core_text_size = mod->core_size;
@@ -1760,17 +1781,18 @@ static void layout_sections(struct module *mod,
DEBUGP("Init section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < hdr->e_shnum; ++i) {
- Elf_Shdr *s = &sechdrs[i];
+ for (i = 0; i < info->hdr->e_shnum; ++i) {
+ Elf_Shdr *s = &info->sechdrs[i];
+ const char *sname = info->secstrings + s->sh_name;
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || !strstarts(secstrings + s->sh_name, ".init"))
+ || !strstarts(sname, ".init"))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
- DEBUGP("\t%s\n", secstrings + s->sh_name);
+ DEBUGP("\t%s\n", sname);
}
if (m == 0)
mod->init_text_size = mod->init_size;
@@ -1809,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize)
return string;
}
-static char *get_modinfo(Elf_Shdr *sechdrs,
- unsigned int info,
- const char *tag)
+static char *get_modinfo(struct load_info *info, const char *tag)
{
char *p;
unsigned int taglen = strlen(tag);
- unsigned long size = sechdrs[info].sh_size;
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
- for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) {
+ for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) {
if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
return p + taglen + 1;
}
return NULL;
}
-static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
- unsigned int infoindex)
+static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
int i;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (attr->setup)
- attr->setup(mod,
- get_modinfo(sechdrs,
- infoindex,
- attr->attr.name));
+ attr->setup(mod, get_modinfo(info, attr->attr.name));
}
}
@@ -1876,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value,
}
/* As per nm */
-static char elf_type(const Elf_Sym *sym,
- Elf_Shdr *sechdrs,
- const char *secstrings,
- struct module *mod)
+static char elf_type(const Elf_Sym *sym, const struct load_info *info)
{
+ const Elf_Shdr *sechdrs = info->sechdrs;
+
if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT)
return 'v';
@@ -1910,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym,
else
return 'b';
}
- if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
+ if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name,
+ ".debug")) {
return 'n';
+ }
return '?';
}
@@ -1936,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
return true;
}
-static unsigned long layout_symtab(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int symindex,
- unsigned int strindex,
- const Elf_Ehdr *hdr,
- const char *secstrings,
- unsigned long *pstroffs,
- unsigned long *strmap)
+static void layout_symtab(struct module *mod, struct load_info *info)
{
- unsigned long symoffs;
- Elf_Shdr *symsect = sechdrs + symindex;
- Elf_Shdr *strsect = sechdrs + strindex;
+ Elf_Shdr *symsect = info->sechdrs + info->index.sym;
+ Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- const char *strtab;
unsigned int i, nsrc, ndst;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
- symindex) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", secstrings + symsect->sh_name);
+ info->index.sym) | INIT_OFFSET_MASK;
+ DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
- src = (void *)hdr + symsect->sh_offset;
+ src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
- strtab = (void *)hdr + strsect->sh_offset;
for (ndst = i = 1; i < nsrc; ++i, ++src)
- if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
+ if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
unsigned int j = src->st_name;
- while(!__test_and_set_bit(j, strmap) && strtab[j])
+ while (!__test_and_set_bit(j, info->strmap)
+ && info->strtab[j])
++j;
++ndst;
}
/* Append room for core symbols at end of core part. */
- symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
+ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
+ mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
- strindex) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", secstrings + strsect->sh_name);
+ info->index.str) | INIT_OFFSET_MASK;
+ DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
/* Append room for core symbols' strings at end of core part. */
- *pstroffs = mod->core_size;
- __set_bit(0, strmap);
- mod->core_size += bitmap_weight(strmap, strsect->sh_size);
-
- return symoffs;
+ info->stroffs = mod->core_size;
+ __set_bit(0, info->strmap);
+ mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
}
-static void add_kallsyms(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int shnum,
- unsigned int symindex,
- unsigned int strindex,
- unsigned long symoffs,
- unsigned long stroffs,
- const char *secstrings,
- unsigned long *strmap)
+static void add_kallsyms(struct module *mod, const struct load_info *info)
{
unsigned int i, ndst;
const Elf_Sym *src;
Elf_Sym *dst;
char *s;
+ Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- mod->symtab = (void *)sechdrs[symindex].sh_addr;
- mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
- mod->strtab = (void *)sechdrs[strindex].sh_addr;
+ mod->symtab = (void *)symsec->sh_addr;
+ mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Make sure we get permanent strtab: don't use info->strtab. */
+ mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
/* Set types up while we still have access to sections. */
for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info
- = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
+ mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
- mod->core_symtab = dst = mod->module_core + symoffs;
+ mod->core_symtab = dst = mod->module_core + info->symoffs;
src = mod->symtab;
*dst = *src;
for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
- if (!is_core_symbol(src, sechdrs, shnum))
+ if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
continue;
dst[ndst] = *src;
- dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
+ dst[ndst].st_name = bitmap_weight(info->strmap,
+ dst[ndst].st_name);
++ndst;
}
mod->core_num_syms = ndst;
- mod->core_strtab = s = mod->module_core + stroffs;
- for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
- if (test_bit(i, strmap))
+ mod->core_strtab = s = mod->module_core + info->stroffs;
+ for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
+ if (test_bit(i, info->strmap))
*++s = mod->strtab[i];
}
#else
-static inline unsigned long layout_symtab(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int symindex,
- unsigned int strindex,
- const Elf_Ehdr *hdr,
- const char *secstrings,
- unsigned long *pstroffs,
- unsigned long *strmap)
+static inline void layout_symtab(struct module *mod, struct load_info *info)
{
- return 0;
}
-static inline void add_kallsyms(struct module *mod,
- Elf_Shdr *sechdrs,
- unsigned int shnum,
- unsigned int symindex,
- unsigned int strindex,
- unsigned long symoffs,
- unsigned long stroffs,
- const char *secstrings,
- const unsigned long *strmap)
+static void add_kallsyms(struct module *mod, struct load_info *info)
{
}
#endif /* CONFIG_KALLSYMS */
static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
{
+ if (!debug)
+ return;
#ifdef CONFIG_DYNAMIC_DEBUG
if (ddebug_add_module(debug, num, debug->modname))
printk(KERN_ERR "dynamic debug error adding module: %s\n",
@@ -2087,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size)
}
#ifdef CONFIG_DEBUG_KMEMLEAK
-static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs, char *secstrings)
+static void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
{
unsigned int i;
/* only scan the sections containing data */
kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
- for (i = 1; i < hdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ const char *name = info->secstrings + info->sechdrs[i].sh_name;
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
continue;
- if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
- && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
+ if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
continue;
- kmemleak_scan_area((void *)sechdrs[i].sh_addr,
- sechdrs[i].sh_size, GFP_KERNEL);
+ kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
+ info->sechdrs[i].sh_size, GFP_KERNEL);
}
}
#else
-static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
- Elf_Shdr *sechdrs, char *secstrings)
+static inline void kmemleak_load_module(const struct module *mod,
+ const struct load_info *info)
{
}
#endif
-/* Allocate and load the module: note that size of section 0 is always
- zero, and we rely on this for optional sections. */
-static noinline struct module *load_module(void __user *umod,
- unsigned long len,
- const char __user *uargs)
+/* Sets info->hdr and info->len. */
+static int copy_and_check(struct load_info *info,
+ const void __user *umod, unsigned long len,
+ const char __user *uargs)
{
+ int err;
Elf_Ehdr *hdr;
- Elf_Shdr *sechdrs;
- char *secstrings, *args, *modmagic, *strtab = NULL;
- char *staging;
- unsigned int i;
- unsigned int symindex = 0;
- unsigned int strindex = 0;
- unsigned int modindex, versindex, infoindex, pcpuindex;
- struct module *mod;
- long err = 0;
- void *ptr = NULL; /* Stops spurious gcc warning */
- unsigned long symoffs, stroffs, *strmap;
- void __percpu *percpu;
- struct _ddebug *debug = NULL;
- unsigned int num_debug = 0;
- mm_segment_t old_fs;
-
- DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
if (len < sizeof(*hdr))
- return ERR_PTR(-ENOEXEC);
+ return -ENOEXEC;
/* Suck in entire file: we'll want most of it. */
/* vmalloc barfs on "unusual" numbers. Check here */
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
@@ -2153,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod,
}
/* Sanity checks against insmoding binaries or wrong arch,
- weird elf version */
+ weird elf version */
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
|| hdr->e_type != ET_REL
|| !elf_check_arch(hdr)
- || hdr->e_shentsize != sizeof(*sechdrs)) {
+ || hdr->e_shentsize != sizeof(Elf_Shdr)) {
err = -ENOEXEC;
goto free_hdr;
}
- if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr))
- goto truncated;
+ if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) {
+ err = -ENOEXEC;
+ goto free_hdr;
+ }
- /* Convenience variables */
- sechdrs = (void *)hdr + hdr->e_shoff;
- secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- sechdrs[0].sh_addr = 0;
+ info->hdr = hdr;
+ info->len = len;
+ return 0;
- for (i = 1; i < hdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_NOBITS
- && len < sechdrs[i].sh_offset + sechdrs[i].sh_size)
- goto truncated;
+free_hdr:
+ vfree(hdr);
+ return err;
+}
+
+static void free_copy(struct load_info *info)
+{
+ vfree(info->hdr);
+}
+
+static int rewrite_section_headers(struct load_info *info)
+{
+ unsigned int i;
+
+ /* This should always be true, but let's be sure. */
+ info->sechdrs[0].sh_addr = 0;
+
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &info->sechdrs[i];
+ if (shdr->sh_type != SHT_NOBITS
+ && info->len < shdr->sh_offset + shdr->sh_size) {
+ printk(KERN_ERR "Module len %lu truncated\n",
+ info->len);
+ return -ENOEXEC;
+ }
/* Mark all sections sh_addr with their address in the
temporary image. */
- sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset;
+ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset;
- /* Internal symbols and strings. */
- if (sechdrs[i].sh_type == SHT_SYMTAB) {
- symindex = i;
- strindex = sechdrs[i].sh_link;
- strtab = (char *)hdr + sechdrs[strindex].sh_offset;
- }
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
- if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
- sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+ shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
}
- modindex = find_sec(hdr, sechdrs, secstrings,
- ".gnu.linkonce.this_module");
- if (!modindex) {
+ /* Track but don't keep modinfo and version sections. */
+ info->index.vers = find_sec(info, "__versions");
+ info->index.info = find_sec(info, ".modinfo");
+ info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ return 0;
+}
+
+/*
+ * Set up our basic convenience variables (pointers to section headers,
+ * search for module section index etc), and do some basic section
+ * verification.
+ *
+ * Return the temporary module pointer (we'll replace it with the final
+ * one when we move the module sections around).
+ */
+static struct module *setup_load_info(struct load_info *info)
+{
+ unsigned int i;
+ int err;
+ struct module *mod;
+
+ /* Set up the convenience variables */
+ info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
+ info->secstrings = (void *)info->hdr
+ + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
+
+ err = rewrite_section_headers(info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Find internal symbols and strings. */
+ for (i = 1; i < info->hdr->e_shnum; i++) {
+ if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
+ info->index.sym = i;
+ info->index.str = info->sechdrs[i].sh_link;
+ info->strtab = (char *)info->hdr
+ + info->sechdrs[info->index.str].sh_offset;
+ break;
+ }
+ }
+
+ info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
+ if (!info->index.mod) {
printk(KERN_WARNING "No module found in object\n");
- err = -ENOEXEC;
- goto free_hdr;
+ return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
- mod = (void *)sechdrs[modindex].sh_addr;
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
- if (symindex == 0) {
+ if (info->index.sym == 0) {
printk(KERN_WARNING "%s: module has no symbols (stripped?)\n",
mod->name);
- err = -ENOEXEC;
- goto free_hdr;
+ return ERR_PTR(-ENOEXEC);
}
- versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
- infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
- pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
-
- /* Don't keep modinfo and version sections. */
- sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
- sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->index.pcpu = find_pcpusec(info);
/* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(sechdrs, versindex, mod)) {
- err = -ENOEXEC;
- goto free_hdr;
- }
+ if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
+ return ERR_PTR(-ENOEXEC);
+
+ return mod;
+}
+
+static int check_modinfo(struct module *mod, struct load_info *info)
+{
+ const char *modmagic = get_modinfo(info, "vermagic");
+ int err;
- modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "bad vermagic");
if (err)
- goto free_hdr;
- } else if (!same_magic(modmagic, vermagic, versindex)) {
+ return err;
+ } else if (!same_magic(modmagic, vermagic, info->index.vers)) {
printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
- err = -ENOEXEC;
- goto free_hdr;
+ return -ENOEXEC;
}
- staging = get_modinfo(sechdrs, infoindex, "staging");
- if (staging) {
+ if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP);
printk(KERN_WARNING "%s: module is from the staging directory,"
" the quality is unknown, you have been warned.\n",
mod->name);
}
- /* Now copy in args */
- args = strndup_user(uargs, ~0UL >> 1);
- if (IS_ERR(args)) {
- err = PTR_ERR(args);
- goto free_hdr;
- }
+ /* Set up license info based on the info section */
+ set_license(mod, get_modinfo(info, "license"));
- strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
- * sizeof(long), GFP_KERNEL);
- if (!strmap) {
- err = -ENOMEM;
- goto free_mod;
- }
+ return 0;
+}
- mod->state = MODULE_STATE_COMING;
+static void find_module_sections(struct module *mod, struct load_info *info)
+{
+ mod->kp = section_objs(info, "__param",
+ sizeof(*mod->kp), &mod->num_kp);
+ mod->syms = section_objs(info, "__ksymtab",
+ sizeof(*mod->syms), &mod->num_syms);
+ mod->crcs = section_addr(info, "__kcrctab");
+ mod->gpl_syms = section_objs(info, "__ksymtab_gpl",
+ sizeof(*mod->gpl_syms),
+ &mod->num_gpl_syms);
+ mod->gpl_crcs = section_addr(info, "__kcrctab_gpl");
+ mod->gpl_future_syms = section_objs(info,
+ "__ksymtab_gpl_future",
+ sizeof(*mod->gpl_future_syms),
+ &mod->num_gpl_future_syms);
+ mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future");
- /* Allow arches to frob section contents and sizes. */
- err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod);
- if (err < 0)
- goto free_mod;
+#ifdef CONFIG_UNUSED_SYMBOLS
+ mod->unused_syms = section_objs(info, "__ksymtab_unused",
+ sizeof(*mod->unused_syms),
+ &mod->num_unused_syms);
+ mod->unused_crcs = section_addr(info, "__kcrctab_unused");
+ mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl",
+ sizeof(*mod->unused_gpl_syms),
+ &mod->num_unused_gpl_syms);
+ mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
+#endif
+#ifdef CONFIG_CONSTRUCTORS
+ mod->ctors = section_objs(info, ".ctors",
+ sizeof(*mod->ctors), &mod->num_ctors);
+#endif
- if (pcpuindex) {
- /* We have a special allocation for this section. */
- err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
- sechdrs[pcpuindex].sh_addralign);
- if (err)
- goto free_mod;
- sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
- }
- /* Keep this around for failure path. */
- percpu = mod_percpu(mod);
+#ifdef CONFIG_TRACEPOINTS
+ mod->tracepoints = section_objs(info, "__tracepoints",
+ sizeof(*mod->tracepoints),
+ &mod->num_tracepoints);
+#endif
+#ifdef CONFIG_EVENT_TRACING
+ mod->trace_events = section_objs(info, "_ftrace_events",
+ sizeof(*mod->trace_events),
+ &mod->num_trace_events);
+ /*
+ * This section contains pointers to allocated objects in the trace
+ * code and not scanning it leads to false positives.
+ */
+ kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
+ mod->num_trace_events, GFP_KERNEL);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ /* sechdrs[0].sh_size is always zero */
+ mod->ftrace_callsites = section_objs(info, "__mcount_loc",
+ sizeof(*mod->ftrace_callsites),
+ &mod->num_ftrace_callsites);
+#endif
- /* Determine total sizes, and put offsets in sh_entsize. For now
- this is done generically; there doesn't appear to be any
- special cases for the architectures. */
- layout_sections(mod, hdr, sechdrs, secstrings);
- symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
- secstrings, &stroffs, strmap);
+ mod->extable = section_objs(info, "__ex_table",
+ sizeof(*mod->extable), &mod->num_exentries);
+
+ if (section_addr(info, "__obsparm"))
+ printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
+ mod->name);
+
+ info->debug = section_objs(info, "__verbose",
+ sizeof(*info->debug), &info->num_debug);
+}
+
+static int move_module(struct module *mod, struct load_info *info)
+{
+ int i;
+ void *ptr;
/* Do the allocs. */
ptr = module_alloc_update_bounds(mod->core_size);
@@ -2291,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod,
* leak.
*/
kmemleak_not_leak(ptr);
- if (!ptr) {
- err = -ENOMEM;
- goto free_percpu;
- }
+ if (!ptr)
+ return -ENOMEM;
+
memset(ptr, 0, mod->core_size);
mod->module_core = ptr;
@@ -2307,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod,
*/
kmemleak_ignore(ptr);
if (!ptr && mod->init_size) {
- err = -ENOMEM;
- goto free_core;
+ module_free(mod, mod->module_core);
+ return -ENOMEM;
}
memset(ptr, 0, mod->init_size);
mod->module_init = ptr;
/* Transfer each section which specifies SHF_ALLOC */
DEBUGP("final section addresses:\n");
- for (i = 0; i < hdr->e_shnum; i++) {
+ for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
+ Elf_Shdr *shdr = &info->sechdrs[i];
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ if (!(shdr->sh_flags & SHF_ALLOC))
continue;
- if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK)
+ if (shdr->sh_entsize & INIT_OFFSET_MASK)
dest = mod->module_init
- + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK);
+ + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
else
- dest = mod->module_core + sechdrs[i].sh_entsize;
+ dest = mod->module_core + shdr->sh_entsize;
- if (sechdrs[i].sh_type != SHT_NOBITS)
- memcpy(dest, (void *)sechdrs[i].sh_addr,
- sechdrs[i].sh_size);
+ if (shdr->sh_type != SHT_NOBITS)
+ memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
/* Update sh_addr to point to copy in image. */
- sechdrs[i].sh_addr = (unsigned long)dest;
- DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name);
- }
- /* Module has been moved. */
- mod = (void *)sechdrs[modindex].sh_addr;
- kmemleak_load_module(mod, hdr, sechdrs, secstrings);
-
-#if defined(CONFIG_MODULE_UNLOAD)
- mod->refptr = alloc_percpu(struct module_ref);
- if (!mod->refptr) {
- err = -ENOMEM;
- goto free_init;
+ shdr->sh_addr = (unsigned long)dest;
+ DEBUGP("\t0x%lx %s\n",
+ shdr->sh_addr, info->secstrings + shdr->sh_name);
}
-#endif
- /* Now we've moved module, initialize linked lists, etc. */
- module_unload_init(mod);
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+ return 0;
+}
+static int check_module_license_and_versions(struct module *mod)
+{
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -2363,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod,
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
- /* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, sechdrs, infoindex);
-
- /* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
- mod);
- if (err < 0)
- goto cleanup;
-
- /* Now we've got everything in the final locations, we can
- * find optional sections. */
- mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
- sizeof(*mod->kp), &mod->num_kp);
- mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
- sizeof(*mod->syms), &mod->num_syms);
- mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
- mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
- sizeof(*mod->gpl_syms),
- &mod->num_gpl_syms);
- mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
- mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
- "__ksymtab_gpl_future",
- sizeof(*mod->gpl_future_syms),
- &mod->num_gpl_future_syms);
- mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
- "__kcrctab_gpl_future");
-
-#ifdef CONFIG_UNUSED_SYMBOLS
- mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
- "__ksymtab_unused",
- sizeof(*mod->unused_syms),
- &mod->num_unused_syms);
- mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
- "__kcrctab_unused");
- mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
- "__ksymtab_unused_gpl",
- sizeof(*mod->unused_gpl_syms),
- &mod->num_unused_gpl_syms);
- mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
- "__kcrctab_unused_gpl");
-#endif
-#ifdef CONFIG_CONSTRUCTORS
- mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
- sizeof(*mod->ctors), &mod->num_ctors);
-#endif
-
-#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
- "__tracepoints",
- sizeof(*mod->tracepoints),
- &mod->num_tracepoints);
-#endif
-#ifdef CONFIG_EVENT_TRACING
- mod->trace_events = section_objs(hdr, sechdrs, secstrings,
- "_ftrace_events",
- sizeof(*mod->trace_events),
- &mod->num_trace_events);
- /*
- * This section contains pointers to allocated objects in the trace
- * code and not scanning it leads to false positives.
- */
- kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
- mod->num_trace_events, GFP_KERNEL);
-#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
- /* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
- "__mcount_loc",
- sizeof(*mod->ftrace_callsites),
- &mod->num_ftrace_callsites);
-#endif
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2443,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod,
|| (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
- err = try_to_force_load(mod,
- "no versions for exported symbols");
- if (err)
- goto cleanup;
+ return try_to_force_load(mod,
+ "no versions for exported symbols");
}
#endif
+ return 0;
+}
- /* Now do relocations. */
- for (i = 1; i < hdr->e_shnum; i++) {
- const char *strtab = (char *)sechdrs[strindex].sh_addr;
- unsigned int info = sechdrs[i].sh_info;
-
- /* Not a valid relocation section? */
- if (info >= hdr->e_shnum)
- continue;
-
- /* Don't bother with non-allocated sections */
- if (!(sechdrs[info].sh_flags & SHF_ALLOC))
- continue;
-
- if (sechdrs[i].sh_type == SHT_REL)
- err = apply_relocate(sechdrs, strtab, symindex, i,mod);
- else if (sechdrs[i].sh_type == SHT_RELA)
- err = apply_relocate_add(sechdrs, strtab, symindex, i,
- mod);
- if (err < 0)
- goto cleanup;
- }
-
- /* Set up and sort exception table */
- mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
- sizeof(*mod->extable), &mod->num_exentries);
- sort_extable(mod->extable, mod->extable + mod->num_exentries);
-
- /* Finally, copy percpu area over. */
- percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
- sechdrs[pcpuindex].sh_size);
-
- add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
- symoffs, stroffs, secstrings, strmap);
- kfree(strmap);
- strmap = NULL;
-
- if (!mod->taints)
- debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
- sizeof(*debug), &num_debug);
-
- err = module_finalize(hdr, sechdrs, mod);
- if (err < 0)
- goto cleanup;
+static void flush_module_icache(const struct module *mod)
+{
+ mm_segment_t old_fs;
/* flush the icache in correct context */
old_fs = get_fs();
@@ -2511,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod,
(unsigned long)mod->module_core + mod->core_size);
set_fs(old_fs);
+}
- mod->args = args;
- if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
- printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
- mod->name);
+static struct module *layout_and_allocate(struct load_info *info)
+{
+ /* Module within temporary copy. */
+ struct module *mod;
+ Elf_Shdr *pcpusec;
+ int err;
+
+ mod = setup_load_info(info);
+ if (IS_ERR(mod))
+ return mod;
+
+ err = check_modinfo(mod, info);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Allow arches to frob section contents and sizes. */
+ err = module_frob_arch_sections(info->hdr, info->sechdrs,
+ info->secstrings, mod);
+ if (err < 0)
+ goto out;
+
+ pcpusec = &info->sechdrs[info->index.pcpu];
+ if (pcpusec->sh_size) {
+ /* We have a special allocation for this section. */
+ err = percpu_modalloc(mod,
+ pcpusec->sh_size, pcpusec->sh_addralign);
+ if (err)
+ goto out;
+ pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
+ }
+
+ /* Determine total sizes, and put offsets in sh_entsize. For now
+ this is done generically; there doesn't appear to be any
+ special cases for the architectures. */
+ layout_sections(mod, info);
+
+ info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
+ * sizeof(long), GFP_KERNEL);
+ if (!info->strmap) {
+ err = -ENOMEM;
+ goto free_percpu;
+ }
+ layout_symtab(mod, info);
+
+ /* Allocate and move to the final place */
+ err = move_module(mod, info);
+ if (err)
+ goto free_strmap;
+
+ /* Module has been copied to its final place now: return it. */
+ mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ kmemleak_load_module(mod, info);
+ return mod;
+
+free_strmap:
+ kfree(info->strmap);
+free_percpu:
+ percpu_modfree(mod);
+out:
+ return ERR_PTR(err);
+}
+
+/* mod is no longer valid after this! */
+static void module_deallocate(struct module *mod, struct load_info *info)
+{
+ kfree(info->strmap);
+ percpu_modfree(mod);
+ module_free(mod, mod->module_init);
+ module_free(mod, mod->module_core);
+}
+
+static int post_relocation(struct module *mod, const struct load_info *info)
+{
+ /* Sort exception table now relocations are done. */
+ sort_extable(mod->extable, mod->extable + mod->num_exentries);
+
+ /* Copy relocated percpu area over. */
+ percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr,
+ info->sechdrs[info->index.pcpu].sh_size);
+
+ /* Setup kallsyms-specific fields. */
+ add_kallsyms(mod, info);
+
+ /* Arch-specific module finalizing. */
+ return module_finalize(info->hdr, info->sechdrs, mod);
+}
+
+/* Allocate and load the module: note that size of section 0 is always
+ zero, and we rely on this for optional sections. */
+static struct module *load_module(void __user *umod,
+ unsigned long len,
+ const char __user *uargs)
+{
+ struct load_info info = { NULL, };
+ struct module *mod;
+ long err;
+
+ DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
+
+ /* Copy in the blobs from userspace, check they are vaguely sane. */
+ err = copy_and_check(&info, umod, len, uargs);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Figure out module layout, and allocate all the memory. */
+ mod = layout_and_allocate(&info);
+ if (IS_ERR(mod)) {
+ err = PTR_ERR(mod);
+ goto free_copy;
+ }
+
+ /* Now module is in final location, initialize linked lists, etc. */
+ err = module_unload_init(mod);
+ if (err)
+ goto free_module;
+
+ /* Now we've got everything in the final locations, we can
+ * find optional sections. */
+ find_module_sections(mod, &info);
+
+ err = check_module_license_and_versions(mod);
+ if (err)
+ goto free_unload;
+
+ /* Set up MODINFO_ATTR fields */
+ setup_modinfo(mod, &info);
+
+ /* Fix up syms, so that st_value is a pointer to location. */
+ err = simplify_symbols(mod, &info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = apply_relocations(mod, &info);
+ if (err < 0)
+ goto free_modinfo;
+
+ err = post_relocation(mod, &info);
+ if (err < 0)
+ goto free_modinfo;
+
+ flush_module_icache(mod);
+
+ /* Now copy in args */
+ mod->args = strndup_user(uargs, ~0UL >> 1);
+ if (IS_ERR(mod->args)) {
+ err = PTR_ERR(mod->args);
+ goto free_arch_cleanup;
+ }
+
+ /* Mark state as coming so strong_try_module_get() ignores us. */
+ mod->state = MODULE_STATE_COMING;
/* Now sew it into the lists so we can get lockdep and oops
* info during argument parsing. Noone should access us, since
@@ -2530,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod,
goto unlock;
}
- if (debug)
- dynamic_debug_setup(debug, num_debug);
+ /* This has to be done once we're sure module name is unique. */
+ if (!mod->taints)
+ dynamic_debug_setup(info.debug, info.num_debug);
/* Find duplicate symbols */
err = verify_export_symbols(mod);
@@ -2541,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod,
list_add_rcu(&mod->list, &modules);
mutex_unlock(&module_mutex);
+ /* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
if (err < 0)
goto unlink;
- err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
+ /* Link in to syfs. */
+ err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
if (err < 0)
goto unlink;
- add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
- add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
-
- /* Get rid of temporary copy */
- vfree(hdr);
-
- trace_module_load(mod);
+ /* Get rid of temporary copy and strmap. */
+ kfree(info.strmap);
+ free_copy(&info);
/* Done! */
+ trace_module_load(mod);
return mod;
unlink:
@@ -2565,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod,
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
ddebug:
- dynamic_debug_remove(debug);
+ if (!mod->taints)
+ dynamic_debug_remove(info.debug);
unlock:
mutex_unlock(&module_mutex);
synchronize_sched();
+ kfree(mod->args);
+ free_arch_cleanup:
module_arch_cleanup(mod);
- cleanup:
+ free_modinfo:
free_modinfo(mod);
+ free_unload:
module_unload_free(mod);
-#if defined(CONFIG_MODULE_UNLOAD)
- free_percpu(mod->refptr);
- free_init:
-#endif
- module_free(mod, mod->module_init);
- free_core:
- module_free(mod, mod->module_core);
- /* mod will be freed with core. Don't access it beyond this line! */
- free_percpu:
- free_percpu(percpu);
- free_mod:
- kfree(args);
- kfree(strmap);
- free_hdr:
- vfree(hdr);
+ free_module:
+ module_deallocate(mod, &info);
+ free_copy:
+ free_copy(&info);
return ERR_PTR(err);
-
- truncated:
- printk(KERN_ERR "Module len %lu truncated\n", len);
- err = -ENOEXEC;
- goto free_hdr;
}
/* Call module constructors. */
diff --git a/kernel/padata.c b/kernel/padata.c
index fdd8ae609ce3..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -26,18 +26,19 @@
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <linux/rcupdate.h>
-#define MAX_SEQ_NR INT_MAX - NR_CPUS
+#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
#define MAX_OBJ_NUM 1000
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
- target_cpu = cpumask_first(pd->cpumask);
+ target_cpu = cpumask_first(pd->cpumask.pcpu);
for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask);
+ target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
return target_cpu;
}
@@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
* Hash the sequence numbers to the cpus by taking
* seq_nr mod. number of cpus in use.
*/
- cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
+ cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
return padata_index_to_cpu(pd, cpu_index);
}
-static void padata_parallel_worker(struct work_struct *work)
+static void padata_parallel_worker(struct work_struct *parallel_work)
{
- struct padata_queue *queue;
+ struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
struct padata_instance *pinst;
LIST_HEAD(local_list);
local_bh_disable();
- queue = container_of(work, struct padata_queue, pwork);
- pd = queue->pd;
+ pqueue = container_of(parallel_work,
+ struct padata_parallel_queue, work);
+ pd = pqueue->pd;
pinst = pd->pinst;
- spin_lock(&queue->parallel.lock);
- list_replace_init(&queue->parallel.list, &local_list);
- spin_unlock(&queue->parallel.lock);
+ spin_lock(&pqueue->parallel.lock);
+ list_replace_init(&pqueue->parallel.list, &local_list);
+ spin_unlock(&pqueue->parallel.lock);
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work)
* @pinst: padata instance
* @padata: object to be parallelized
* @cb_cpu: cpu the serialization callback function will run on,
- * must be in the cpumask of padata.
+ * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
*
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
@@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
struct padata_priv *padata, int cb_cpu)
{
int target_cpu, err;
- struct padata_queue *queue;
+ struct padata_parallel_queue *queue;
struct parallel_data *pd;
rcu_read_lock_bh();
pd = rcu_dereference(pinst->pd);
- err = 0;
- if (!(pinst->flags & PADATA_INIT))
+ err = -EINVAL;
+ if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
+ goto out;
+
+ if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
goto out;
err = -EBUSY;
@@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
goto out;
- err = -EINVAL;
- if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
- goto out;
-
- err = -EINPROGRESS;
+ err = 0;
atomic_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = cb_cpu;
@@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->seq_nr = atomic_inc_return(&pd->seq_nr);
target_cpu = padata_cpu_hash(padata);
- queue = per_cpu_ptr(pd->queue, target_cpu);
+ queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
list_add_tail(&padata->list, &queue->parallel.list);
spin_unlock(&queue->parallel.lock);
- queue_work_on(target_cpu, pinst->wq, &queue->pwork);
+ queue_work_on(target_cpu, pinst->wq, &queue->work);
out:
rcu_read_unlock_bh();
@@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel);
*/
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
- int cpu, num_cpus, empty, calc_seq_nr;
- int seq_nr, next_nr, overrun, next_overrun;
- struct padata_queue *queue, *next_queue;
+ int cpu, num_cpus;
+ int next_nr, next_index;
+ struct padata_parallel_queue *queue, *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
- empty = 0;
- next_nr = -1;
- next_overrun = 0;
- next_queue = NULL;
-
- num_cpus = cpumask_weight(pd->cpumask);
-
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
- reorder = &queue->reorder;
-
- /*
- * Calculate the seq_nr of the object that should be
- * next in this reorder queue.
- */
- overrun = 0;
- calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
- + queue->cpu_index;
+ num_cpus = cpumask_weight(pd->cpumask.pcpu);
- if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
- calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
- overrun = 1;
- }
-
- if (!list_empty(&reorder->list)) {
- padata = list_entry(reorder->list.next,
- struct padata_priv, list);
-
- seq_nr = padata->seq_nr;
- BUG_ON(calc_seq_nr != seq_nr);
- } else {
- seq_nr = calc_seq_nr;
- empty++;
- }
-
- if (next_nr < 0 || seq_nr < next_nr
- || (next_overrun && !overrun)) {
- next_nr = seq_nr;
- next_overrun = overrun;
- next_queue = queue;
- }
+ /*
+ * Calculate the percpu reorder queue and the sequence
+ * number of the next object.
+ */
+ next_nr = pd->processed;
+ next_index = next_nr % num_cpus;
+ cpu = padata_index_to_cpu(pd, next_index);
+ next_queue = per_cpu_ptr(pd->pqueue, cpu);
+
+ if (unlikely(next_nr > pd->max_seq_nr)) {
+ next_nr = next_nr - pd->max_seq_nr - 1;
+ next_index = next_nr % num_cpus;
+ cpu = padata_index_to_cpu(pd, next_index);
+ next_queue = per_cpu_ptr(pd->pqueue, cpu);
+ pd->processed = 0;
}
padata = NULL;
- if (empty == num_cpus)
- goto out;
-
reorder = &next_queue->reorder;
if (!list_empty(&reorder->list)) {
padata = list_entry(reorder->list.next,
struct padata_priv, list);
- if (unlikely(next_overrun)) {
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
- atomic_set(&queue->num_obj, 0);
- }
- }
+ BUG_ON(next_nr != padata->seq_nr);
spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
spin_unlock(&reorder->lock);
- atomic_inc(&next_queue->num_obj);
+ pd->processed++;
goto out;
}
- queue = per_cpu_ptr(pd->queue, smp_processor_id());
+ queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
if (queue->cpu_index == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
goto out;
@@ -262,7 +231,7 @@ out:
static void padata_reorder(struct parallel_data *pd)
{
struct padata_priv *padata;
- struct padata_queue *queue;
+ struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
/*
@@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd)
return;
}
- queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
+ squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
- spin_lock(&queue->serial.lock);
- list_add_tail(&padata->list, &queue->serial.list);
- spin_unlock(&queue->serial.lock);
+ spin_lock(&squeue->serial.lock);
+ list_add_tail(&padata->list, &squeue->serial.list);
+ spin_unlock(&squeue->serial.lock);
- queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
+ queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
}
spin_unlock_bh(&pd->lock);
@@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg)
padata_reorder(pd);
}
-static void padata_serial_worker(struct work_struct *work)
+static void padata_serial_worker(struct work_struct *serial_work)
{
- struct padata_queue *queue;
+ struct padata_serial_queue *squeue;
struct parallel_data *pd;
LIST_HEAD(local_list);
local_bh_disable();
- queue = container_of(work, struct padata_queue, swork);
- pd = queue->pd;
+ squeue = container_of(serial_work, struct padata_serial_queue, work);
+ pd = squeue->pd;
- spin_lock(&queue->serial.lock);
- list_replace_init(&queue->serial.list, &local_list);
- spin_unlock(&queue->serial.lock);
+ spin_lock(&squeue->serial.lock);
+ list_replace_init(&squeue->serial.list, &local_list);
+ spin_unlock(&squeue->serial.lock);
while (!list_empty(&local_list)) {
struct padata_priv *padata;
@@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
void padata_do_serial(struct padata_priv *padata)
{
int cpu;
- struct padata_queue *queue;
+ struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
pd = padata->pd;
cpu = get_cpu();
- queue = per_cpu_ptr(pd->queue, cpu);
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
- spin_lock(&queue->reorder.lock);
+ spin_lock(&pqueue->reorder.lock);
atomic_inc(&pd->reorder_objects);
- list_add_tail(&padata->list, &queue->reorder.list);
- spin_unlock(&queue->reorder.lock);
+ list_add_tail(&padata->list, &pqueue->reorder.list);
+ spin_unlock(&pqueue->reorder.lock);
put_cpu();
@@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata)
}
EXPORT_SYMBOL(padata_do_serial);
-/* Allocate and initialize the internal cpumask dependend resources. */
-static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
- const struct cpumask *cpumask)
+static int padata_setup_cpumasks(struct parallel_data *pd,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
{
- int cpu, cpu_index, num_cpus;
- struct padata_queue *queue;
- struct parallel_data *pd;
-
- cpu_index = 0;
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
+ return -ENOMEM;
- pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
- if (!pd)
- goto err;
+ cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
+ free_cpumask_var(pd->cpumask.cbcpu);
+ return -ENOMEM;
+ }
- pd->queue = alloc_percpu(struct padata_queue);
- if (!pd->queue)
- goto err_free_pd;
+ cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
+ return 0;
+}
- if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
- goto err_free_queue;
+static void __padata_list_init(struct padata_list *pd_list)
+{
+ INIT_LIST_HEAD(&pd_list->list);
+ spin_lock_init(&pd_list->lock);
+}
- cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+/* Initialize all percpu queues used by serial workers */
+static void padata_init_squeues(struct parallel_data *pd)
+{
+ int cpu;
+ struct padata_serial_queue *squeue;
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
+ for_each_cpu(cpu, pd->cpumask.cbcpu) {
+ squeue = per_cpu_ptr(pd->squeue, cpu);
+ squeue->pd = pd;
+ __padata_list_init(&squeue->serial);
+ INIT_WORK(&squeue->work, padata_serial_worker);
+ }
+}
- queue->pd = pd;
+/* Initialize all percpu queues used by parallel workers */
+static void padata_init_pqueues(struct parallel_data *pd)
+{
+ int cpu_index, num_cpus, cpu;
+ struct padata_parallel_queue *pqueue;
- queue->cpu_index = cpu_index;
+ cpu_index = 0;
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ pqueue->pd = pd;
+ pqueue->cpu_index = cpu_index;
cpu_index++;
- INIT_LIST_HEAD(&queue->reorder.list);
- INIT_LIST_HEAD(&queue->parallel.list);
- INIT_LIST_HEAD(&queue->serial.list);
- spin_lock_init(&queue->reorder.lock);
- spin_lock_init(&queue->parallel.lock);
- spin_lock_init(&queue->serial.lock);
-
- INIT_WORK(&queue->pwork, padata_parallel_worker);
- INIT_WORK(&queue->swork, padata_serial_worker);
- atomic_set(&queue->num_obj, 0);
+ __padata_list_init(&pqueue->reorder);
+ __padata_list_init(&pqueue->parallel);
+ INIT_WORK(&pqueue->work, padata_parallel_worker);
+ atomic_set(&pqueue->num_obj, 0);
}
- num_cpus = cpumask_weight(pd->cpumask);
- pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
+ num_cpus = cpumask_weight(pd->cpumask.pcpu);
+ pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
+}
+
+/* Allocate and initialize the internal cpumask dependend resources. */
+static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
+{
+ struct parallel_data *pd;
+ pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
+ if (!pd)
+ goto err;
+
+ pd->pqueue = alloc_percpu(struct padata_parallel_queue);
+ if (!pd->pqueue)
+ goto err_free_pd;
+
+ pd->squeue = alloc_percpu(struct padata_serial_queue);
+ if (!pd->squeue)
+ goto err_free_pqueue;
+ if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
+ goto err_free_squeue;
+
+ padata_init_pqueues(pd);
+ padata_init_squeues(pd);
setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
@@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
return pd;
-err_free_queue:
- free_percpu(pd->queue);
+err_free_squeue:
+ free_percpu(pd->squeue);
+err_free_pqueue:
+ free_percpu(pd->pqueue);
err_free_pd:
kfree(pd);
err:
@@ -456,8 +464,10 @@ err:
static void padata_free_pd(struct parallel_data *pd)
{
- free_cpumask_var(pd->cpumask);
- free_percpu(pd->queue);
+ free_cpumask_var(pd->cpumask.pcpu);
+ free_cpumask_var(pd->cpumask.cbcpu);
+ free_percpu(pd->pqueue);
+ free_percpu(pd->squeue);
kfree(pd);
}
@@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd)
static void padata_flush_queues(struct parallel_data *pd)
{
int cpu;
- struct padata_queue *queue;
+ struct padata_parallel_queue *pqueue;
+ struct padata_serial_queue *squeue;
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
- flush_work(&queue->pwork);
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
+ pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ flush_work(&pqueue->work);
}
del_timer_sync(&pd->timer);
@@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd)
if (atomic_read(&pd->reorder_objects))
padata_reorder(pd);
- for_each_cpu(cpu, pd->cpumask) {
- queue = per_cpu_ptr(pd->queue, cpu);
- flush_work(&queue->swork);
+ for_each_cpu(cpu, pd->cpumask.cbcpu) {
+ squeue = per_cpu_ptr(pd->squeue, cpu);
+ flush_work(&squeue->work);
}
BUG_ON(atomic_read(&pd->refcnt) != 0);
}
+static void __padata_start(struct padata_instance *pinst)
+{
+ pinst->flags |= PADATA_INIT;
+}
+
+static void __padata_stop(struct padata_instance *pinst)
+{
+ if (!(pinst->flags & PADATA_INIT))
+ return;
+
+ pinst->flags &= ~PADATA_INIT;
+
+ synchronize_rcu();
+
+ get_online_cpus();
+ padata_flush_queues(pinst->pd);
+ put_online_cpus();
+}
+
/* Replace the internal control stucture with a new one. */
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
{
struct parallel_data *pd_old = pinst->pd;
+ int notification_mask = 0;
pinst->flags |= PADATA_RESET;
@@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
synchronize_rcu();
+ if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
+ notification_mask |= PADATA_CPU_PARALLEL;
+ if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
+ notification_mask |= PADATA_CPU_SERIAL;
+
padata_flush_queues(pd_old);
padata_free_pd(pd_old);
+ if (notification_mask)
+ blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
+ notification_mask,
+ &pd_new->cpumask);
+
pinst->flags &= ~PADATA_RESET;
}
/**
- * padata_set_cpumask - set the cpumask that padata should use
+ * padata_register_cpumask_notifier - Registers a notifier that will be called
+ * if either pcpu or cbcpu or both cpumasks change.
*
- * @pinst: padata instance
- * @cpumask: the cpumask to use
+ * @pinst: A poineter to padata instance
+ * @nblock: A pointer to notifier block.
*/
-int padata_set_cpumask(struct padata_instance *pinst,
- cpumask_var_t cpumask)
+int padata_register_cpumask_notifier(struct padata_instance *pinst,
+ struct notifier_block *nblock)
{
+ return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
+ nblock);
+}
+EXPORT_SYMBOL(padata_register_cpumask_notifier);
+
+/**
+ * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
+ * registered earlier using padata_register_cpumask_notifier
+ *
+ * @pinst: A pointer to data instance.
+ * @nlock: A pointer to notifier block.
+ */
+int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
+ struct notifier_block *nblock)
+{
+ return blocking_notifier_chain_unregister(
+ &pinst->cpumask_change_notifier,
+ nblock);
+}
+EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
+
+
+/* If cpumask contains no active cpu, we mark the instance as invalid. */
+static bool padata_validate_cpumask(struct padata_instance *pinst,
+ const struct cpumask *cpumask)
+{
+ if (!cpumask_intersects(cpumask, cpu_active_mask)) {
+ pinst->flags |= PADATA_INVALID;
+ return false;
+ }
+
+ pinst->flags &= ~PADATA_INVALID;
+ return true;
+}
+
+static int __padata_set_cpumasks(struct padata_instance *pinst,
+ cpumask_var_t pcpumask,
+ cpumask_var_t cbcpumask)
+{
+ int valid;
struct parallel_data *pd;
- int err = 0;
+
+ valid = padata_validate_cpumask(pinst, pcpumask);
+ if (!valid) {
+ __padata_stop(pinst);
+ goto out_replace;
+ }
+
+ valid = padata_validate_cpumask(pinst, cbcpumask);
+ if (!valid)
+ __padata_stop(pinst);
+
+out_replace:
+ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+ if (!pd)
+ return -ENOMEM;
+
+ cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
+
+ padata_replace(pinst, pd);
+
+ if (valid)
+ __padata_start(pinst);
+
+ return 0;
+}
+
+/**
+ * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
+ * one is used by parallel workers and the second one
+ * by the wokers doing serialization.
+ *
+ * @pinst: padata instance
+ * @pcpumask: the cpumask to use for parallel workers
+ * @cbcpumask: the cpumsak to use for serial workers
+ */
+int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
+ cpumask_var_t cbcpumask)
+{
+ int err;
mutex_lock(&pinst->lock);
+ get_online_cpus();
+ err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
+
+ put_online_cpus();
+ mutex_unlock(&pinst->lock);
+
+ return err;
+
+}
+EXPORT_SYMBOL(padata_set_cpumasks);
+
+/**
+ * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
+ * equivalent to @cpumask.
+ *
+ * @pinst: padata instance
+ * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
+ * to parallel and serial cpumasks respectively.
+ * @cpumask: the cpumask to use
+ */
+int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
+ cpumask_var_t cpumask)
+{
+ struct cpumask *serial_mask, *parallel_mask;
+ int err = -EINVAL;
+
+ mutex_lock(&pinst->lock);
get_online_cpus();
- pd = padata_alloc_pd(pinst, cpumask);
- if (!pd) {
- err = -ENOMEM;
- goto out;
+ switch (cpumask_type) {
+ case PADATA_CPU_PARALLEL:
+ serial_mask = pinst->cpumask.cbcpu;
+ parallel_mask = cpumask;
+ break;
+ case PADATA_CPU_SERIAL:
+ parallel_mask = pinst->cpumask.pcpu;
+ serial_mask = cpumask;
+ break;
+ default:
+ goto out;
}
- cpumask_copy(pinst->cpumask, cpumask);
-
- padata_replace(pinst, pd);
+ err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
out:
put_online_cpus();
-
mutex_unlock(&pinst->lock);
return err;
@@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
struct parallel_data *pd;
if (cpumask_test_cpu(cpu, cpu_active_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask);
+ pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+ pinst->cpumask.cbcpu);
if (!pd)
return -ENOMEM;
padata_replace(pinst, pd);
+
+ if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
+ padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
+ __padata_start(pinst);
}
return 0;
}
-/**
- * padata_add_cpu - add a cpu to the padata cpumask
+ /**
+ * padata_add_cpu - add a cpu to one or both(parallel and serial)
+ * padata cpumasks.
*
* @pinst: padata instance
* @cpu: cpu to add
+ * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
+ * The @mask may be any combination of the following flags:
+ * PADATA_CPU_SERIAL - serial cpumask
+ * PADATA_CPU_PARALLEL - parallel cpumask
*/
-int padata_add_cpu(struct padata_instance *pinst, int cpu)
+
+int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
{
int err;
+ if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+ return -EINVAL;
+
mutex_lock(&pinst->lock);
get_online_cpus();
- cpumask_set_cpu(cpu, pinst->cpumask);
+ if (mask & PADATA_CPU_SERIAL)
+ cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
+ if (mask & PADATA_CPU_PARALLEL)
+ cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
+
err = __padata_add_cpu(pinst, cpu);
put_online_cpus();
@@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
{
- struct parallel_data *pd;
+ struct parallel_data *pd = NULL;
if (cpumask_test_cpu(cpu, cpu_online_mask)) {
- pd = padata_alloc_pd(pinst, pinst->cpumask);
+
+ if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
+ !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
+ __padata_stop(pinst);
+
+ pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
+ pinst->cpumask.cbcpu);
if (!pd)
return -ENOMEM;
@@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
return 0;
}
-/**
- * padata_remove_cpu - remove a cpu from the padata cpumask
+ /**
+ * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
+ * padata cpumasks.
*
* @pinst: padata instance
* @cpu: cpu to remove
+ * @mask: bitmask specifying from which cpumask @cpu should be removed
+ * The @mask may be any combination of the following flags:
+ * PADATA_CPU_SERIAL - serial cpumask
+ * PADATA_CPU_PARALLEL - parallel cpumask
*/
-int padata_remove_cpu(struct padata_instance *pinst, int cpu)
+int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
{
int err;
+ if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
+ return -EINVAL;
+
mutex_lock(&pinst->lock);
get_online_cpus();
- cpumask_clear_cpu(cpu, pinst->cpumask);
+ if (mask & PADATA_CPU_SERIAL)
+ cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
+ if (mask & PADATA_CPU_PARALLEL)
+ cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
+
err = __padata_remove_cpu(pinst, cpu);
put_online_cpus();
@@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu);
*
* @pinst: padata instance to start
*/
-void padata_start(struct padata_instance *pinst)
+int padata_start(struct padata_instance *pinst)
{
+ int err = 0;
+
mutex_lock(&pinst->lock);
- pinst->flags |= PADATA_INIT;
+
+ if (pinst->flags & PADATA_INVALID)
+ err =-EINVAL;
+
+ __padata_start(pinst);
+
mutex_unlock(&pinst->lock);
+
+ return err;
}
EXPORT_SYMBOL(padata_start);
@@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start);
void padata_stop(struct padata_instance *pinst)
{
mutex_lock(&pinst->lock);
- pinst->flags &= ~PADATA_INIT;
+ __padata_stop(pinst);
mutex_unlock(&pinst->lock);
}
EXPORT_SYMBOL(padata_stop);
#ifdef CONFIG_HOTPLUG_CPU
+
+static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
+{
+ return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
+ cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
+}
+
+
static int padata_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
@@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
err = __padata_add_cpu(pinst, cpu);
@@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
err = __padata_remove_cpu(pinst, cpu);
@@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
__padata_remove_cpu(pinst, cpu);
@@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb,
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- if (!cpumask_test_cpu(cpu, pinst->cpumask))
+ if (!pinst_has_cpu(pinst, cpu))
break;
mutex_lock(&pinst->lock);
__padata_add_cpu(pinst, cpu);
@@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb,
}
#endif
+static void __padata_free(struct padata_instance *pinst)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ unregister_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+ padata_stop(pinst);
+ padata_free_pd(pinst->pd);
+ free_cpumask_var(pinst->cpumask.pcpu);
+ free_cpumask_var(pinst->cpumask.cbcpu);
+ kfree(pinst);
+}
+
+#define kobj2pinst(_kobj) \
+ container_of(_kobj, struct padata_instance, kobj)
+#define attr2pentry(_attr) \
+ container_of(_attr, struct padata_sysfs_entry, attr)
+
+static void padata_sysfs_release(struct kobject *kobj)
+{
+ struct padata_instance *pinst = kobj2pinst(kobj);
+ __padata_free(pinst);
+}
+
+struct padata_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
+ ssize_t (*store)(struct padata_instance *, struct attribute *,
+ const char *, size_t);
+};
+
+static ssize_t show_cpumask(struct padata_instance *pinst,
+ struct attribute *attr, char *buf)
+{
+ struct cpumask *cpumask;
+ ssize_t len;
+
+ mutex_lock(&pinst->lock);
+ if (!strcmp(attr->name, "serial_cpumask"))
+ cpumask = pinst->cpumask.cbcpu;
+ else
+ cpumask = pinst->cpumask.pcpu;
+
+ len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
+ nr_cpu_ids);
+ if (PAGE_SIZE - len < 2)
+ len = -EINVAL;
+ else
+ len += sprintf(buf + len, "\n");
+
+ mutex_unlock(&pinst->lock);
+ return len;
+}
+
+static ssize_t store_cpumask(struct padata_instance *pinst,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ cpumask_var_t new_cpumask;
+ ssize_t ret;
+ int mask_type;
+
+ if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
+ nr_cpumask_bits);
+ if (ret < 0)
+ goto out;
+
+ mask_type = !strcmp(attr->name, "serial_cpumask") ?
+ PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
+ ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
+ if (!ret)
+ ret = count;
+
+out:
+ free_cpumask_var(new_cpumask);
+ return ret;
+}
+
+#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
+ static struct padata_sysfs_entry _name##_attr = \
+ __ATTR(_name, 0644, _show_name, _store_name)
+#define PADATA_ATTR_RO(_name, _show_name) \
+ static struct padata_sysfs_entry _name##_attr = \
+ __ATTR(_name, 0400, _show_name, NULL)
+
+PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
+PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
+
+/*
+ * Padata sysfs provides the following objects:
+ * serial_cpumask [RW] - cpumask for serial workers
+ * parallel_cpumask [RW] - cpumask for parallel workers
+ */
+static struct attribute *padata_default_attrs[] = {
+ &serial_cpumask_attr.attr,
+ &parallel_cpumask_attr.attr,
+ NULL,
+};
+
+static ssize_t padata_sysfs_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct padata_instance *pinst;
+ struct padata_sysfs_entry *pentry;
+ ssize_t ret = -EIO;
+
+ pinst = kobj2pinst(kobj);
+ pentry = attr2pentry(attr);
+ if (pentry->show)
+ ret = pentry->show(pinst, attr, buf);
+
+ return ret;
+}
+
+static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct padata_instance *pinst;
+ struct padata_sysfs_entry *pentry;
+ ssize_t ret = -EIO;
+
+ pinst = kobj2pinst(kobj);
+ pentry = attr2pentry(attr);
+ if (pentry->show)
+ ret = pentry->store(pinst, attr, buf, count);
+
+ return ret;
+}
+
+static const struct sysfs_ops padata_sysfs_ops = {
+ .show = padata_sysfs_show,
+ .store = padata_sysfs_store,
+};
+
+static struct kobj_type padata_attr_type = {
+ .sysfs_ops = &padata_sysfs_ops,
+ .default_attrs = padata_default_attrs,
+ .release = padata_sysfs_release,
+};
+
/**
- * padata_alloc - allocate and initialize a padata instance
+ * padata_alloc_possible - Allocate and initialize padata instance.
+ * Use the cpu_possible_mask for serial and
+ * parallel workers.
*
- * @cpumask: cpumask that padata uses for parallelization
* @wq: workqueue to use for the allocated padata instance
*/
-struct padata_instance *padata_alloc(const struct cpumask *cpumask,
- struct workqueue_struct *wq)
+struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+{
+ return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+}
+EXPORT_SYMBOL(padata_alloc_possible);
+
+/**
+ * padata_alloc - allocate and initialize a padata instance and specify
+ * cpumasks for serial and parallel workers.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ * @pcpumask: cpumask that will be used for padata parallelization
+ * @cbcpumask: cpumask that will be used for padata serialization
+ */
+struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+ const struct cpumask *pcpumask,
+ const struct cpumask *cbcpumask)
{
struct padata_instance *pinst;
- struct parallel_data *pd;
+ struct parallel_data *pd = NULL;
pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
if (!pinst)
goto err;
get_online_cpus();
-
- pd = padata_alloc_pd(pinst, cpumask);
- if (!pd)
+ if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
goto err_free_inst;
+ if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
+ free_cpumask_var(pinst->cpumask.pcpu);
+ goto err_free_inst;
+ }
+ if (!padata_validate_cpumask(pinst, pcpumask) ||
+ !padata_validate_cpumask(pinst, cbcpumask))
+ goto err_free_masks;
- if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL))
- goto err_free_pd;
+ pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
+ if (!pd)
+ goto err_free_masks;
rcu_assign_pointer(pinst->pd, pd);
pinst->wq = wq;
- cpumask_copy(pinst->cpumask, cpumask);
+ cpumask_copy(pinst->cpumask.pcpu, pcpumask);
+ cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
pinst->flags = 0;
@@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
put_online_cpus();
+ BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
+ kobject_init(&pinst->kobj, &padata_attr_type);
mutex_init(&pinst->lock);
return pinst;
-err_free_pd:
- padata_free_pd(pd);
+err_free_masks:
+ free_cpumask_var(pinst->cpumask.pcpu);
+ free_cpumask_var(pinst->cpumask.cbcpu);
err_free_inst:
kfree(pinst);
put_online_cpus();
@@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc);
*/
void padata_free(struct padata_instance *pinst)
{
- padata_stop(pinst);
-
- synchronize_rcu();
-
-#ifdef CONFIG_HOTPLUG_CPU
- unregister_hotcpu_notifier(&pinst->cpu_notifier);
-#endif
- get_online_cpus();
- padata_flush_queues(pinst->pd);
- put_online_cpus();
-
- padata_free_pd(pinst->pd);
- free_cpumask_var(pinst->cpumask);
- kfree(pinst);
+ kobject_put(&pinst->kobj);
}
EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 3b16cd93fa7d..4c13b1a88ebb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,9 @@
#include <linux/nmi.h>
#include <linux/dmi.h>
+#define PANIC_TIMER_STEP 100
+#define PANIC_BLINK_SPD 18
+
int panic_on_oops;
static unsigned long tainted_mask;
static int pause_on_oops;
@@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
-/* Returns how long it waited in ms */
-long (*panic_blink)(long time);
-EXPORT_SYMBOL(panic_blink);
-
-static void panic_blink_one_second(void)
+static long no_blink(int state)
{
- static long i = 0, end;
-
- if (panic_blink) {
- end = i + MSEC_PER_SEC;
-
- while (i < end) {
- i += panic_blink(i);
- mdelay(1);
- i++;
- }
- } else {
- /*
- * When running under a hypervisor a small mdelay may get
- * rounded up to the hypervisor timeslice. For example, with
- * a 1ms in 10ms hypervisor timeslice we might inflate a
- * mdelay(1) loop by 10x.
- *
- * If we have nothing to blink, spin on 1 second calls to
- * mdelay to avoid this.
- */
- mdelay(MSEC_PER_SEC);
- }
+ return 0;
}
+/* Returns how long it waited in ms */
+long (*panic_blink)(int state);
+EXPORT_SYMBOL(panic_blink);
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...)
{
static char buf[1024];
va_list args;
- long i;
+ long i, i_next = 0;
+ int state = 0;
/*
* It's possible to come here directly from a panic-assertion and
@@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...)
bust_spinlocks(0);
+ if (!panic_blink)
+ panic_blink = no_blink;
+
if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
@@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
- for (i = 0; i < panic_timeout; i++) {
+ for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
- panic_blink_one_second();
+ if (i >= i_next) {
+ i += panic_blink(state ^= 1);
+ i_next = i + 3600 / PANIC_BLINK_SPD;
+ }
+ mdelay(PANIC_TIMER_STEP);
}
/*
* This will not be a clean reboot, with everything
@@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...)
}
#endif
local_irq_enable();
- while (1) {
+ for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
- panic_blink_one_second();
+ if (i >= i_next) {
+ i += panic_blink(state ^= 1);
+ i_next = i + 3600 / PANIC_BLINK_SPD;
+ }
+ mdelay(PANIC_TIMER_STEP);
}
}
@@ -344,7 +338,7 @@ static int init_oops_id(void)
}
late_initcall(init_oops_id);
-static void print_oops_end_marker(void)
+void print_oops_end_marker(void)
{
init_oops_id();
printk(KERN_WARNING "---[ end trace %016llx ]---\n",
diff --git a/kernel/params.c b/kernel/params.c
index 0b30ecd53a52..08107d181758 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -31,6 +31,42 @@
#define DEBUGP(fmt, a...)
#endif
+/* Protects all parameters, and incidentally kmalloced_param list. */
+static DEFINE_MUTEX(param_lock);
+
+/* This just allows us to keep track of which parameters are kmalloced. */
+struct kmalloced_param {
+ struct list_head list;
+ char val[];
+};
+static LIST_HEAD(kmalloced_params);
+
+static void *kmalloc_parameter(unsigned int size)
+{
+ struct kmalloced_param *p;
+
+ p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ list_add(&p->list, &kmalloced_params);
+ return p->val;
+}
+
+/* Does nothing if parameter wasn't kmalloced above. */
+static void maybe_kfree_parameter(void *param)
+{
+ struct kmalloced_param *p;
+
+ list_for_each_entry(p, &kmalloced_params, list) {
+ if (p->val == param) {
+ list_del(&p->list);
+ kfree(p);
+ break;
+ }
+ }
+}
+
static inline char dash2underscore(char c)
{
if (c == '-')
@@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname)
static int parse_one(char *param,
char *val,
- struct kernel_param *params,
+ const struct kernel_param *params,
unsigned num_params,
int (*handle_unknown)(char *param, char *val))
{
unsigned int i;
+ int err;
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
+ /* Noone handled NULL, so do it here. */
+ if (!val && params[i].ops->set != param_set_bool)
+ return -EINVAL;
DEBUGP("They are equal! Calling %p\n",
- params[i].set);
- return params[i].set(val, &params[i]);
+ params[i].ops->set);
+ mutex_lock(&param_lock);
+ err = params[i].ops->set(val, &params[i]);
+ mutex_unlock(&param_lock);
+ return err;
}
}
@@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val)
/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
int parse_args(const char *name,
char *args,
- struct kernel_param *params,
+ const struct kernel_param *params,
unsigned num,
int (*unknown)(char *param, char *val))
{
@@ -176,22 +219,29 @@ int parse_args(const char *name,
/* Lazy bastard, eh? */
#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
- int param_set_##name(const char *val, struct kernel_param *kp) \
+ int param_set_##name(const char *val, const struct kernel_param *kp) \
{ \
tmptype l; \
int ret; \
\
- if (!val) return -EINVAL; \
ret = strtolfn(val, 0, &l); \
if (ret == -EINVAL || ((type)l != l)) \
return -EINVAL; \
*((type *)kp->arg) = l; \
return 0; \
} \
- int param_get_##name(char *buffer, struct kernel_param *kp) \
+ int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
return sprintf(buffer, format, *((type *)kp->arg)); \
- }
+ } \
+ struct kernel_param_ops param_ops_##name = { \
+ .set = param_set_##name, \
+ .get = param_get_##name, \
+ }; \
+ EXPORT_SYMBOL(param_set_##name); \
+ EXPORT_SYMBOL(param_get_##name); \
+ EXPORT_SYMBOL(param_ops_##name)
+
STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
@@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
-int param_set_charp(const char *val, struct kernel_param *kp)
+int param_set_charp(const char *val, const struct kernel_param *kp)
{
- if (!val) {
- printk(KERN_ERR "%s: string parameter expected\n",
- kp->name);
- return -EINVAL;
- }
-
if (strlen(val) > 1024) {
printk(KERN_ERR "%s: string parameter too long\n",
kp->name);
return -ENOSPC;
}
- /* This is a hack. We can't need to strdup in early boot, and we
+ maybe_kfree_parameter(*(char **)kp->arg);
+
+ /* This is a hack. We can't kmalloc in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
- *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
+ *(char **)kp->arg = kmalloc_parameter(strlen(val)+1);
if (!*(char **)kp->arg)
return -ENOMEM;
+ strcpy(*(char **)kp->arg, val);
} else
*(const char **)kp->arg = val;
return 0;
}
+EXPORT_SYMBOL(param_set_charp);
-int param_get_charp(char *buffer, struct kernel_param *kp)
+int param_get_charp(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%s", *((char **)kp->arg));
}
+EXPORT_SYMBOL(param_get_charp);
+
+static void param_free_charp(void *arg)
+{
+ maybe_kfree_parameter(*((char **)arg));
+}
+
+struct kernel_param_ops param_ops_charp = {
+ .set = param_set_charp,
+ .get = param_get_charp,
+ .free = param_free_charp,
+};
+EXPORT_SYMBOL(param_ops_charp);
/* Actually could be a bool or an int, for historical reasons. */
-int param_set_bool(const char *val, struct kernel_param *kp)
+int param_set_bool(const char *val, const struct kernel_param *kp)
{
bool v;
@@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp)
*(int *)kp->arg = v;
return 0;
}
+EXPORT_SYMBOL(param_set_bool);
-int param_get_bool(char *buffer, struct kernel_param *kp)
+int param_get_bool(char *buffer, const struct kernel_param *kp)
{
bool val;
if (kp->flags & KPARAM_ISBOOL)
@@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp)
/* Y and N chosen as being relatively non-coder friendly */
return sprintf(buffer, "%c", val ? 'Y' : 'N');
}
+EXPORT_SYMBOL(param_get_bool);
+
+struct kernel_param_ops param_ops_bool = {
+ .set = param_set_bool,
+ .get = param_get_bool,
+};
+EXPORT_SYMBOL(param_ops_bool);
/* This one must be bool. */
-int param_set_invbool(const char *val, struct kernel_param *kp)
+int param_set_invbool(const char *val, const struct kernel_param *kp)
{
int ret;
bool boolval;
@@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp)
*(bool *)kp->arg = !boolval;
return ret;
}
+EXPORT_SYMBOL(param_set_invbool);
-int param_get_invbool(char *buffer, struct kernel_param *kp)
+int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
}
+EXPORT_SYMBOL(param_get_invbool);
+
+struct kernel_param_ops param_ops_invbool = {
+ .set = param_set_invbool,
+ .get = param_get_invbool,
+};
+EXPORT_SYMBOL(param_ops_invbool);
/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
unsigned int min, unsigned int max,
void *elem, int elemsize,
- int (*set)(const char *, struct kernel_param *kp),
+ int (*set)(const char *, const struct kernel_param *kp),
u16 flags,
unsigned int *num)
{
@@ -309,12 +386,6 @@ static int param_array(const char *name,
kp.arg = elem;
kp.flags = flags;
- /* No equals sign? */
- if (!val) {
- printk(KERN_ERR "%s: expects arguments\n", name);
- return -EINVAL;
- }
-
*num = 0;
/* We expect a comma-separated list of values. */
do {
@@ -330,6 +401,7 @@ static int param_array(const char *name,
/* nul-terminate and parse */
save = val[len];
((char *)val)[len] = '\0';
+ BUG_ON(!mutex_is_locked(&param_lock));
ret = set(val, &kp);
if (ret != 0)
@@ -347,17 +419,17 @@ static int param_array(const char *name,
return 0;
}
-int param_array_set(const char *val, struct kernel_param *kp)
+static int param_array_set(const char *val, const struct kernel_param *kp)
{
const struct kparam_array *arr = kp->arr;
unsigned int temp_num;
return param_array(kp->name, val, 1, arr->max, arr->elem,
- arr->elemsize, arr->set, kp->flags,
+ arr->elemsize, arr->ops->set, kp->flags,
arr->num ?: &temp_num);
}
-int param_array_get(char *buffer, struct kernel_param *kp)
+static int param_array_get(char *buffer, const struct kernel_param *kp)
{
int i, off, ret;
const struct kparam_array *arr = kp->arr;
@@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp)
if (i)
buffer[off++] = ',';
p.arg = arr->elem + arr->elemsize * i;
- ret = arr->get(buffer + off, &p);
+ BUG_ON(!mutex_is_locked(&param_lock));
+ ret = arr->ops->get(buffer + off, &p);
if (ret < 0)
return ret;
off += ret;
@@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp)
return off;
}
-int param_set_copystring(const char *val, struct kernel_param *kp)
+static void param_array_free(void *arg)
+{
+ unsigned int i;
+ const struct kparam_array *arr = arg;
+
+ if (arr->ops->free)
+ for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
+ arr->ops->free(arr->elem + arr->elemsize * i);
+}
+
+struct kernel_param_ops param_array_ops = {
+ .set = param_array_set,
+ .get = param_array_get,
+ .free = param_array_free,
+};
+EXPORT_SYMBOL(param_array_ops);
+
+int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- if (!val) {
- printk(KERN_ERR "%s: missing param set value\n", kp->name);
- return -EINVAL;
- }
if (strlen(val)+1 > kps->maxlen) {
printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
@@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
strcpy(kps->string, val);
return 0;
}
+EXPORT_SYMBOL(param_set_copystring);
-int param_get_string(char *buffer, struct kernel_param *kp)
+int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
return strlcpy(buffer, kps->string, kps->maxlen);
}
+EXPORT_SYMBOL(param_get_string);
+
+struct kernel_param_ops param_ops_string = {
+ .set = param_set_copystring,
+ .get = param_get_string,
+};
+EXPORT_SYMBOL(param_ops_string);
/* sysfs output in /sys/modules/XYZ/parameters/ */
#define to_module_attr(n) container_of(n, struct module_attribute, attr)
@@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[];
struct param_attribute
{
struct module_attribute mattr;
- struct kernel_param *param;
+ const struct kernel_param *param;
};
struct module_param_attrs
@@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
int count;
struct param_attribute *attribute = to_param_attr(mattr);
- if (!attribute->param->get)
+ if (!attribute->param->ops->get)
return -EPERM;
- count = attribute->param->get(buf, attribute->param);
+ mutex_lock(&param_lock);
+ count = attribute->param->ops->get(buf, attribute->param);
+ mutex_unlock(&param_lock);
if (count > 0) {
strcat(buf, "\n");
++count;
@@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
int err;
struct param_attribute *attribute = to_param_attr(mattr);
- if (!attribute->param->set)
+ if (!attribute->param->ops->set)
return -EPERM;
- err = attribute->param->set(buf, attribute->param);
+ mutex_lock(&param_lock);
+ err = attribute->param->ops->set(buf, attribute->param);
+ mutex_unlock(&param_lock);
if (!err)
return len;
return err;
@@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
#endif
#ifdef CONFIG_SYSFS
+void __kernel_param_lock(void)
+{
+ mutex_lock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_lock);
+
+void __kernel_param_unlock(void)
+{
+ mutex_unlock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_unlock);
+
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
@@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
* if there's an error.
*/
static __modinit int add_sysfs_param(struct module_kobject *mk,
- struct kernel_param *kp,
+ const struct kernel_param *kp,
const char *name)
{
struct module_param_attrs *new;
@@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk)
* /sys/module/[mod->name]/parameters/
*/
int module_param_sysfs_setup(struct module *mod,
- struct kernel_param *kparam,
+ const struct kernel_param *kparam,
unsigned int num_params)
{
int i, err;
@@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod)
void destroy_params(const struct kernel_param *params, unsigned num)
{
- /* FIXME: This should free kmalloced charp parameters. It doesn't. */
+ unsigned int i;
+
+ for (i = 0; i < num; i++)
+ if (params[i].ops->free)
+ params[i].ops->free(params[i].arg);
}
static void __init kernel_add_sysfs_param(const char *name,
@@ -768,28 +882,3 @@ static int __init param_sysfs_init(void)
subsys_initcall(param_sysfs_init);
#endif /* CONFIG_SYSFS */
-
-EXPORT_SYMBOL(param_set_byte);
-EXPORT_SYMBOL(param_get_byte);
-EXPORT_SYMBOL(param_set_short);
-EXPORT_SYMBOL(param_get_short);
-EXPORT_SYMBOL(param_set_ushort);
-EXPORT_SYMBOL(param_get_ushort);
-EXPORT_SYMBOL(param_set_int);
-EXPORT_SYMBOL(param_get_int);
-EXPORT_SYMBOL(param_set_uint);
-EXPORT_SYMBOL(param_get_uint);
-EXPORT_SYMBOL(param_set_long);
-EXPORT_SYMBOL(param_get_long);
-EXPORT_SYMBOL(param_set_ulong);
-EXPORT_SYMBOL(param_get_ulong);
-EXPORT_SYMBOL(param_set_charp);
-EXPORT_SYMBOL(param_get_charp);
-EXPORT_SYMBOL(param_set_bool);
-EXPORT_SYMBOL(param_get_bool);
-EXPORT_SYMBOL(param_set_invbool);
-EXPORT_SYMBOL(param_get_invbool);
-EXPORT_SYMBOL(param_array_set);
-EXPORT_SYMBOL(param_array_get);
-EXPORT_SYMBOL(param_set_copystring);
-EXPORT_SYMBOL(param_get_string);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f416aef242c3..0d38f27ad885 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
static inline u64 perf_clock(void)
{
- return cpu_clock(raw_smp_processor_id());
+ return local_clock();
}
/*
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c132d26..d55c6fb8d087 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid)
atomic_inc(&map->nr_free);
}
+/*
+ * If we started walking pids at 'base', is 'a' seen before 'b'?
+ */
+static int pid_before(int base, int a, int b)
+{
+ /*
+ * This is the same as saying
+ *
+ * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
+ * and that mapping orders 'a' and 'b' with respect to 'base'.
+ */
+ return (unsigned)(a - base) < (unsigned)(b - base);
+}
+
+/*
+ * We might be racing with someone else trying to set pid_ns->last_pid.
+ * We want the winner to have the "later" value, because if the
+ * "earlier" value prevails, then a pid may get reused immediately.
+ *
+ * Since pids rollover, it is not sufficient to just pick the bigger
+ * value. We have to consider where we started counting from.
+ *
+ * 'base' is the value of pid_ns->last_pid that we observed when
+ * we started looking for a pid.
+ *
+ * 'pid' is the pid that we eventually found.
+ */
+static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
+{
+ int prev;
+ int last_write = base;
+ do {
+ prev = last_write;
+ last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
+ } while ((prev != last_write) && (pid_before(base, last_write, pid)));
+}
+
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
@@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
- max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
+ /*
+ * If last_pid points into the middle of the map->page we
+ * want to scan this bitmap block twice, the second time
+ * we start with offset == 0 (or RESERVED_PIDS).
+ */
+ max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
for (i = 0; i <= max_scan; ++i) {
if (unlikely(!map->page)) {
void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
- pid_ns->last_pid = pid;
+ set_last_pid(pid_ns, last, pid);
return pid;
}
offset = find_next_offset(map, offset);
pid = mk_pid(pid_ns, map, offset);
- /*
- * find_next_offset() found a bit, the pid from it
- * is in-bounds, and if we fell back to the last
- * bitmap block and the final block was the same
- * as the starting point, pid is before last_pid.
- */
- } while (offset < BITS_PER_PAGE && pid < pid_max &&
- (i != max_scan || pid < last ||
- !((last+1) & BITS_PER_PAGE_MASK)));
+ } while (offset < BITS_PER_PAGE && pid < pid_max);
}
if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index f42d3f737a33..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -48,59 +48,49 @@
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
-struct pm_qos_request_list {
- struct list_head list;
- union {
- s32 value;
- s32 usec;
- s32 kbps;
- };
- int pm_qos_class;
+enum pm_qos_type {
+ PM_QOS_MAX, /* return the largest value */
+ PM_QOS_MIN /* return the smallest value */
};
-static s32 max_compare(s32 v1, s32 v2);
-static s32 min_compare(s32 v1, s32 v2);
-
struct pm_qos_object {
- struct pm_qos_request_list requests;
+ struct plist_head requests;
struct blocking_notifier_head *notifiers;
struct miscdevice pm_qos_power_miscdev;
char *name;
s32 default_value;
- atomic_t target_value;
- s32 (*comparitor)(s32, s32);
+ enum pm_qos_type type;
};
+static DEFINE_SPINLOCK(pm_qos_lock);
+
static struct pm_qos_object null_pm_qos;
static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
static struct pm_qos_object cpu_dma_pm_qos = {
- .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
- .comparitor = min_compare
+ .type = PM_QOS_MIN,
};
static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
static struct pm_qos_object network_lat_pm_qos = {
- .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
.notifiers = &network_lat_notifier,
.name = "network_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
- .comparitor = min_compare
+ .type = PM_QOS_MIN
};
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_object network_throughput_pm_qos = {
- .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
+ .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
.default_value = 0,
- .target_value = ATOMIC_INIT(0),
- .comparitor = max_compare
+ .type = PM_QOS_MAX,
};
@@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
&network_throughput_pm_qos
};
-static DEFINE_SPINLOCK(pm_qos_lock);
-
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos);
static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
.release = pm_qos_power_release,
};
-/* static helper functions */
-static s32 max_compare(s32 v1, s32 v2)
+/* unlocked internal variant */
+static inline int pm_qos_get_value(struct pm_qos_object *o)
{
- return max(v1, v2);
-}
+ if (plist_head_empty(&o->requests))
+ return o->default_value;
-static s32 min_compare(s32 v1, s32 v2)
-{
- return min(v1, v2);
-}
+ switch (o->type) {
+ case PM_QOS_MIN:
+ return plist_last(&o->requests)->prio;
+ case PM_QOS_MAX:
+ return plist_first(&o->requests)->prio;
-static void update_target(int pm_qos_class)
+ default:
+ /* runtime check for not using enum */
+ BUG();
+ }
+}
+
+static void update_target(struct pm_qos_object *o, struct plist_node *node,
+ int del, int value)
{
- s32 extreme_value;
- struct pm_qos_request_list *node;
unsigned long flags;
- int call_notifier = 0;
+ int prev_value, curr_value;
spin_lock_irqsave(&pm_qos_lock, flags);
- extreme_value = pm_qos_array[pm_qos_class]->default_value;
- list_for_each_entry(node,
- &pm_qos_array[pm_qos_class]->requests.list, list) {
- extreme_value = pm_qos_array[pm_qos_class]->comparitor(
- extreme_value, node->value);
- }
- if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
- extreme_value) {
- call_notifier = 1;
- atomic_set(&pm_qos_array[pm_qos_class]->target_value,
- extreme_value);
- pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
- atomic_read(&pm_qos_array[pm_qos_class]->target_value));
+ prev_value = pm_qos_get_value(o);
+ /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
+ if (value != PM_QOS_DEFAULT_VALUE) {
+ /*
+ * to change the list, we atomically remove, reinit
+ * with new value and add, then see if the extremal
+ * changed
+ */
+ plist_del(node, &o->requests);
+ plist_node_init(node, value);
+ plist_add(node, &o->requests);
+ } else if (del) {
+ plist_del(node, &o->requests);
+ } else {
+ plist_add(node, &o->requests);
}
+ curr_value = pm_qos_get_value(o);
spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (call_notifier)
- blocking_notifier_call_chain(
- pm_qos_array[pm_qos_class]->notifiers,
- (unsigned long) extreme_value, NULL);
+ if (prev_value != curr_value)
+ blocking_notifier_call_chain(o->notifiers,
+ (unsigned long)curr_value,
+ NULL);
}
static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor)
*/
int pm_qos_request(int pm_qos_class)
{
- return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
+ unsigned long flags;
+ int value;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return value;
}
EXPORT_SYMBOL_GPL(pm_qos_request);
+int pm_qos_request_active(struct pm_qos_request_list *req)
+{
+ return req->pm_qos_class != 0;
+}
+EXPORT_SYMBOL_GPL(pm_qos_request_active);
+
/**
* pm_qos_add_request - inserts new qos request into the list
* @pm_qos_class: identifies which list of qos request to us
@@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
* element as a handle for use in updating and removal. Call needs to save
* this handle for later use.
*/
-struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
+void pm_qos_add_request(struct pm_qos_request_list *dep,
+ int pm_qos_class, s32 value)
{
- struct pm_qos_request_list *dep;
- unsigned long flags;
+ struct pm_qos_object *o = pm_qos_array[pm_qos_class];
+ int new_value;
- dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
- if (dep) {
- if (value == PM_QOS_DEFAULT_VALUE)
- dep->value = pm_qos_array[pm_qos_class]->default_value;
- else
- dep->value = value;
- dep->pm_qos_class = pm_qos_class;
-
- spin_lock_irqsave(&pm_qos_lock, flags);
- list_add(&dep->list,
- &pm_qos_array[pm_qos_class]->requests.list);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- update_target(pm_qos_class);
+ if (pm_qos_request_active(dep)) {
+ WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
+ return;
}
-
- return dep;
+ if (value == PM_QOS_DEFAULT_VALUE)
+ new_value = o->default_value;
+ else
+ new_value = value;
+ plist_node_init(&dep->list, new_value);
+ dep->pm_qos_class = pm_qos_class;
+ update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
@@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
* Attempts are made to make this code callable on hot code paths.
*/
void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
- s32 new_value)
+ s32 new_value)
{
- unsigned long flags;
- int pending_update = 0;
s32 temp;
+ struct pm_qos_object *o;
+
+ if (!pm_qos_req) /*guard against callers passing in null */
+ return;
- if (pm_qos_req) { /*guard against callers passing in null */
- spin_lock_irqsave(&pm_qos_lock, flags);
- if (new_value == PM_QOS_DEFAULT_VALUE)
- temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
- else
- temp = new_value;
-
- if (temp != pm_qos_req->value) {
- pending_update = 1;
- pm_qos_req->value = temp;
- }
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (pending_update)
- update_target(pm_qos_req->pm_qos_class);
+ if (!pm_qos_request_active(pm_qos_req)) {
+ WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ return;
}
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+
+ if (new_value == PM_QOS_DEFAULT_VALUE)
+ temp = o->default_value;
+ else
+ temp = new_value;
+
+ if (temp != pm_qos_req->list.prio)
+ update_target(o, &pm_qos_req->list, 0, temp);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
@@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request);
*/
void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
{
- unsigned long flags;
- int qos_class;
+ struct pm_qos_object *o;
if (pm_qos_req == NULL)
return;
/* silent return to keep pcm code cleaner */
- qos_class = pm_qos_req->pm_qos_class;
- spin_lock_irqsave(&pm_qos_lock, flags);
- list_del(&pm_qos_req->list);
- kfree(pm_qos_req);
- spin_unlock_irqrestore(&pm_qos_lock, flags);
- update_target(qos_class);
+ if (!pm_qos_request_active(pm_qos_req)) {
+ WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
+ return;
+ }
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+ update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
+ memset(pm_qos_req, 0, sizeof(*pm_qos_req));
}
EXPORT_SYMBOL_GPL(pm_qos_remove_request);
@@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
if (pm_qos_class >= 0) {
- filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
- PM_QOS_DEFAULT_VALUE);
+ struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req));
+ if (!req)
+ return -ENOMEM;
+
+ pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
+ filp->private_data = req;
if (filp->private_data)
return 0;
@@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
{
struct pm_qos_request_list *req;
- req = (struct pm_qos_request_list *)filp->private_data;
+ req = filp->private_data;
pm_qos_remove_request(req);
+ kfree(req);
return 0;
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..6842eeba5879 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -16,13 +16,13 @@
* siglock protection since other code may update expiration cache as
* well.
*/
-void update_rlimit_cpu(unsigned long rlim_new)
+void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
cputime_t cputime = secs_to_cputime(rlim_new);
- spin_lock_irq(&current->sighand->siglock);
- set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
- spin_unlock_irq(&current->sighand->siglock);
+ spin_lock_irq(&task->sighand->siglock);
+ set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
+ spin_unlock_irq(&task->sighand->siglock);
}
static int check_clock(const clockid_t which_clock)
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
- struct sighand_struct *sighand;
- struct signal_struct *sig;
+ struct signal_struct *sig = tsk->signal;
struct task_struct *t;
- *times = INIT_CPUTIME;
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
rcu_read_lock();
- sighand = rcu_dereference(tsk->sighand);
- if (!sighand)
+ /* make sure we can trust tsk->thread_group list */
+ if (!likely(pid_alive(tsk)))
goto out;
- sig = tsk->signal;
-
t = tsk;
do {
times->utime = cputime_add(times->utime, t->utime);
times->stime = cputime_add(times->stime, t->stime);
times->sum_exec_runtime += t->se.sum_exec_runtime;
-
- t = next_thread(t);
- } while (t != tsk);
-
- times->utime = cputime_add(times->utime, sig->utime);
- times->stime = cputime_add(times->stime, sig->stime);
- times->sum_exec_runtime += sig->sum_sched_runtime;
+ } while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
- /* tsk == current, ensure it is safe to use ->signal/sighand */
- if (unlikely(tsk->exit_state))
- return 0;
-
if (!task_cputime_zero(&tsk->cputime_expires)) {
struct task_cputime task_sample = {
.utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (sig->cputimer.running) {
struct task_cputime group_sample;
- thread_group_cputimer(tsk, &group_sample);
+ spin_lock(&sig->cputimer.lock);
+ group_sample = sig->cputimer.cputime;
+ spin_unlock(&sig->cputimer.lock);
+
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
}
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
{
LIST_HEAD(firing);
struct k_itimer *timer, *next;
+ unsigned long flags;
BUG_ON(!irqs_disabled());
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
if (!fastpath_timer_check(tsk))
return;
- spin_lock(&tsk->sighand->siglock);
+ if (!lock_task_sighand(tsk, &flags))
+ return;
/*
* Here we take off tsk->signal->cpu_timers[N] and
* tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* that gets the timer lock before we do will give it up and
* spin until we've taken care of that timer below.
*/
- spin_unlock(&tsk->sighand->siglock);
+ unlock_task_sighand(tsk, &flags);
/*
* Now that all the timers on our list have the firing flag,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ad723420acc3..9ca4973f736d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->it_clock = which_clock;
new_timer->it_overrun = -1;
- if (copy_to_user(created_timer_id,
- &new_timer_id, sizeof (new_timer_id))) {
- error = -EFAULT;
- goto out;
- }
if (timer_event_spec) {
if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
error = -EFAULT;
@@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
new_timer->sigq->info.si_tid = new_timer->it_id;
new_timer->sigq->info.si_code = SI_TIMER;
+ if (copy_to_user(created_timer_id,
+ &new_timer_id, sizeof (new_timer_id))) {
+ error = -EFAULT;
+ goto out;
+ }
+
error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
if (error)
goto out;
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 97024fd40cd5..83bbc7c02df9 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
static int submit(int rw, struct block_device *bdev, sector_t sector,
struct page *page, struct bio **bio_chain)
{
- const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
+ const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..c77963938bca 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
- * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
*
* This file is released under the GPLv2.
@@ -277,7 +277,7 @@ static int create_image(int platform_mode)
goto Enable_irqs;
}
- if (hibernation_test(TEST_CORE))
+ if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
goto Power_up;
in_suspend = 1;
@@ -288,8 +288,10 @@ static int create_image(int platform_mode)
error);
/* Restore control flow magically appears here */
restore_processor_state();
- if (!in_suspend)
+ if (!in_suspend) {
+ events_check_enabled = false;
platform_leave(platform_mode);
+ }
Power_up:
sysdev_resume();
@@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
error = platform_begin(platform_mode);
if (error)
- return error;
+ goto Close;
/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory();
@@ -336,6 +338,7 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
+ hibernation_freeze_swap();
saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
@@ -511,18 +514,24 @@ int hibernation_platform_enter(void)
local_irq_disable();
sysdev_suspend(PMSG_HIBERNATE);
+ if (!pm_check_wakeup_events()) {
+ error = -EAGAIN;
+ goto Power_up;
+ }
+
hibernation_ops->enter();
/* We should never get here */
while (1);
- /*
- * We don't need to reenable the nonboot CPUs or resume consoles, since
- * the system is going to be halted anyway.
- */
+ Power_up:
+ sysdev_resume();
+ local_irq_enable();
+ enable_nonboot_cpus();
+
Platform_finish:
hibernation_ops->finish();
- dpm_suspend_noirq(PMSG_RESTORE);
+ dpm_resume_noirq(PMSG_RESTORE);
Resume_devices:
entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(state);
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The 'wakeup_count' attribute, along with the functions defined in
+ * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
+ * handled in a non-racy way.
+ *
+ * If a wakeup event occurs when the system is in a sleep state, it simply is
+ * woken up. In turn, if an event that would wake the system up from a sleep
+ * state occurs when it is undergoing a transition to that sleep state, the
+ * transition should be aborted. Moreover, if such an event occurs when the
+ * system is in the working state, an attempt to start a transition to the
+ * given sleep state should fail during certain period after the detection of
+ * the event. Using the 'state' attribute alone is not sufficient to satisfy
+ * these requirements, because a wakeup event may occur exactly when 'state'
+ * is being written to and may be delivered to user space right before it is
+ * frozen, so the event will remain only partially processed until the system is
+ * woken up by another event. In particular, it won't cause the transition to
+ * a sleep state to be aborted.
+ *
+ * This difficulty may be overcome if user space uses 'wakeup_count' before
+ * writing to 'state'. It first should read from 'wakeup_count' and store
+ * the read value. Then, after carrying out its own preparations for the system
+ * transition to a sleep state, it should write the stored value to
+ * 'wakeup_count'. If that fails, at least one wakeup event has occured since
+ * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
+ * is allowed to write to 'state', but the transition will be aborted if there
+ * are any wakeup events detected after 'wakeup_count' was written to.
+ */
+
+static ssize_t wakeup_count_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ unsigned long val;
+
+ return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
+}
+
+static ssize_t wakeup_count_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (sscanf(buf, "%lu", &val) == 1) {
+ if (pm_save_wakeup_count(val))
+ return n;
+ }
+ return -EINVAL;
+}
+
+power_attr(wakeup_count);
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
#endif
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
+ &wakeup_count_attr.attr,
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 71ae29052ab6..028a99598f49 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,7 @@
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/delay.h>
+#include <linux/workqueue.h>
/*
* Timeout for stopping processes
@@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only)
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
+ bool wq_busy = false;
struct timeval start, end;
u64 elapsed_csecs64;
unsigned int elapsed_csecs;
@@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only)
do_gettimeofday(&start);
end_time = jiffies + TIMEOUT;
+
+ if (!sig_only)
+ freeze_workqueues_begin();
+
while (true) {
todo = 0;
read_lock(&tasklist_lock);
@@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only)
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+
+ if (!sig_only) {
+ wq_busy = freeze_workqueues_busy();
+ todo += wq_busy;
+ }
+
if (!todo || time_after(jiffies, end_time))
break;
@@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only)
*/
printk("\n");
printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
- "(%d tasks refusing to freeze):\n",
- elapsed_csecs / 100, elapsed_csecs % 100, todo);
+ "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ elapsed_csecs / 100, elapsed_csecs % 100,
+ todo - wq_busy, wq_busy);
+
+ thaw_workqueues();
+
read_lock(&tasklist_lock);
do_each_thread(g, p) {
task_lock(p);
@@ -157,6 +173,7 @@ void thaw_processes(void)
oom_killer_enable();
printk("Restarting tasks ... ");
+ thaw_workqueues();
thaw_tasks(true);
thaw_tasks(false);
schedule();
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..5e7edfb05e66 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
*
* This file provides system snapshot/restore functionality for swsusp.
*
- * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
* This file is released under the GPLv2.
@@ -1086,6 +1086,7 @@ void swsusp_free(void)
buffer = NULL;
alloc_normal = 0;
alloc_highmem = 0;
+ hibernation_thaw_swap();
}
/* Helper functions used for the shrinking of memory. */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
- return error;
+ goto Platform_finish;
}
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
- goto Platfrom_finish;
+ goto Platform_finish;
}
if (suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
- goto Power_up_devices;
+ goto Platform_wake;
}
if (suspend_test(TEST_PLATFORM))
@@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
- if (!suspend_test(TEST_CORE))
+ if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
error = suspend_ops->enter(state);
+ events_check_enabled = false;
+ }
sysdev_resume();
}
@@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
if (suspend_ops->wake)
suspend_ops->wake();
- Power_up_devices:
dpm_resume_noirq(PMSG_RESUME);
- Platfrom_finish:
+ Platform_finish:
if (suspend_ops->finish)
suspend_ops->finish();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b0bb21778391..5d0059eed3e4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
* This file provides functions for reading the suspend image from
* and writing it to a swap partition.
*
- * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
*
* This file is released under the GPLv2.
@@ -32,7 +32,7 @@
/*
* The swap map is a data structure used for keeping track of each page
* written to a swap partition. It consists of many swap_map_page
- * structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ * structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
* These structures are stored on the swap and linked together with the
* help of the .next_swap member.
*
@@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap)
{
unsigned long offset;
- offset = swp_offset(get_swap_page_of_type(swap));
+ offset = swp_offset(get_swap_for_hibernation(swap));
if (offset) {
if (swsusp_extents_insert(offset))
- swap_free(swp_entry(swap, offset));
+ swap_free_for_hibernation(swp_entry(swap, offset));
else
return swapdev_block(swap, offset);
}
@@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
/**
* free_all_swap_pages - free swap pages allocated for saving image data.
- * It also frees the extents used to register which swap entres had been
+ * It also frees the extents used to register which swap entries had been
* allocated.
*/
@@ -163,7 +163,7 @@ void free_all_swap_pages(int swap)
ext = container_of(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
- swap_free(swp_entry(swap, offset));
+ swap_free_for_hibernation(swp_entry(swap, offset));
kfree(ext);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..8fe465ac008a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,8 @@
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/uaccess.h>
@@ -985,6 +987,32 @@ void resume_console(void)
}
/**
+ * console_cpu_notify - print deferred console messages after CPU hotplug
+ * @self: notifier struct
+ * @action: CPU hotplug event
+ * @hcpu: unused
+ *
+ * If printk() is called from a CPU that is not online yet, the messages
+ * will be spooled but will not show up on the console. This function is
+ * called when a new CPU comes online (or fails to come up), and ensures
+ * that any such output gets printed.
+ */
+static int __cpuinit console_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ case CPU_DYING:
+ case CPU_DOWN_FAILED:
+ case CPU_UP_CANCELED:
+ acquire_console_sem();
+ release_console_sem();
+ }
+ return NOTIFY_OK;
+}
+
+/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
@@ -1371,7 +1399,7 @@ int unregister_console(struct console *console)
}
EXPORT_SYMBOL(unregister_console);
-static int __init disable_boot_consoles(void)
+static int __init printk_late_init(void)
{
struct console *con;
@@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void)
unregister_console(con);
}
}
+ hotcpu_notifier(console_cpu_notify, 0);
return 0;
}
-late_initcall(disable_boot_consoles);
+late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
@@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
chars = logged_chars;
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (logged_chars > end) {
- s1 = log_buf + log_buf_len - logged_chars + end;
- l1 = logged_chars - end;
+ if (chars > end) {
+ s1 = log_buf + log_buf_len - chars + end;
+ l1 = chars - end;
s2 = log_buf;
l2 = end;
@@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason)
s1 = "";
l1 = 0;
- s2 = log_buf + end - logged_chars;
- l2 = logged_chars;
+ s2 = log_buf + end - chars;
+ l2 = chars;
}
if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 74a3d693c196..f34d798ef4a2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -324,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
}
/*
- * Detach all tasks we were using ptrace on.
+ * Detach all tasks we were using ptrace on. Called with tasklist held
+ * for writing, and returns with it held too. But note it can release
+ * and reacquire the lock.
*/
void exit_ptrace(struct task_struct *tracer)
{
struct task_struct *p, *n;
LIST_HEAD(ptrace_dead);
- write_lock_irq(&tasklist_lock);
+ if (likely(list_empty(&tracer->ptraced)))
+ return;
+
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, &ptrace_dead);
}
- write_unlock_irq(&tasklist_lock);
+ write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&tracer->ptraced));
list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
+
+ write_lock_irq(&tasklist_lock);
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
diff --git a/kernel/range.c b/kernel/range.c
index 74e2e6114927..471b66acabb5 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -7,10 +7,6 @@
#include <linux/range.h>
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#endif
-
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
{
if (start >= end)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 72a8dc9567f5..4d169835fb36 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void)
}
EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
#endif /* #ifdef CONFIG_PROVE_RCU */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+static inline void debug_init_rcu_head(struct rcu_head *head)
+{
+ debug_object_init(head, &rcuhead_debug_descr);
+}
+
+static inline void debug_rcu_head_free(struct rcu_head *head)
+{
+ debug_object_free(head, &rcuhead_debug_descr);
+}
+
+/*
+ * fixup_init is called when:
+ * - an active object is initialized
+ */
+static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ /*
+ * Ensure that queued callbacks are all executed.
+ * If we detect that we are nested in a RCU read-side critical
+ * section, we should simply fail, otherwise we would deadlock.
+ */
+ if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+ irqs_disabled()) {
+ WARN_ON(1);
+ return 0;
+ }
+ rcu_barrier();
+ rcu_barrier_sched();
+ rcu_barrier_bh();
+ debug_object_init(head, &rcuhead_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_activate is called when:
+ * - an active object is activated
+ * - an unknown object is activated (might be a statically initialized object)
+ * Activation is performed internally by call_rcu().
+ */
+static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+
+ case ODEBUG_STATE_NOTAVAILABLE:
+ /*
+ * This is not really a fixup. We just make sure that it is
+ * tracked in the object tracker.
+ */
+ debug_object_init(head, &rcuhead_debug_descr);
+ debug_object_activate(head, &rcuhead_debug_descr);
+ return 0;
+
+ case ODEBUG_STATE_ACTIVE:
+ /*
+ * Ensure that queued callbacks are all executed.
+ * If we detect that we are nested in a RCU read-side critical
+ * section, we should simply fail, otherwise we would deadlock.
+ */
+ if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+ irqs_disabled()) {
+ WARN_ON(1);
+ return 0;
+ }
+ rcu_barrier();
+ rcu_barrier_sched();
+ rcu_barrier_bh();
+ debug_object_activate(head, &rcuhead_debug_descr);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/*
+ * fixup_free is called when:
+ * - an active object is freed
+ */
+static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
+{
+ struct rcu_head *head = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_ACTIVE:
+ /*
+ * Ensure that queued callbacks are all executed.
+ * If we detect that we are nested in a RCU read-side critical
+ * section, we should simply fail, otherwise we would deadlock.
+ */
+#ifndef CONFIG_PREEMPT
+ WARN_ON(1);
+ return 0;
+#else
+ if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
+ irqs_disabled()) {
+ WARN_ON(1);
+ return 0;
+ }
+ rcu_barrier();
+ rcu_barrier_sched();
+ rcu_barrier_bh();
+ debug_object_free(head, &rcuhead_debug_descr);
+ return 1;
+#endif
+ default:
+ return 0;
+ }
+}
+
+/**
+ * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects of a new rcu_head structure that
+ * has been allocated as an auto variable on the stack. This function
+ * is not required for rcu_head structures that are statically defined or
+ * that are dynamically allocated on the heap. This function has no
+ * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void init_rcu_head_on_stack(struct rcu_head *head)
+{
+ debug_object_init_on_stack(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);
+
+/**
+ * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
+ * @head: pointer to rcu_head structure to be initialized
+ *
+ * This function informs debugobjects that an on-stack rcu_head structure
+ * is about to go out of scope. As with init_rcu_head_on_stack(), this
+ * function is not required for rcu_head structures that are statically
+ * defined or that are dynamically allocated on the heap. Also as with
+ * init_rcu_head_on_stack(), this function has no effect for
+ * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
+ */
+void destroy_rcu_head_on_stack(struct rcu_head *head)
+{
+ debug_object_free(head, &rcuhead_debug_descr);
+}
+EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
+
+struct debug_obj_descr rcuhead_debug_descr = {
+ .name = "rcu_head",
+ .fixup_init = rcuhead_fixup_init,
+ .fixup_activate = rcuhead_fixup_activate,
+ .fixup_free = rcuhead_fixup_free,
+};
+EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
+#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 38729d3cd236..196ec02f8be0 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
while (list) {
next = list->next;
prefetch(next);
+ debug_rcu_head_unqueue(list);
list->func(list);
list = next;
}
@@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head,
{
unsigned long flags;
+ debug_rcu_head_queue(head);
head->func = func;
head->next = NULL;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
rcu_random(struct rcu_random_state *rrsp)
{
if (--rrsp->rrs_count < 0) {
- rrsp->rrs_state +=
- (unsigned long)cpu_clock(raw_smp_processor_id());
+ rrsp->rrs_state += (unsigned long)local_clock();
rrsp->rrs_count = RCU_RANDOM_REFRESH;
}
rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d4437345706f..d5bc43976c5a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
while (list) {
next = list->next;
prefetch(next);
+ debug_rcu_head_unqueue(list);
list->func(list);
list = next;
if (++count >= rdp->blimit)
@@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
+ debug_rcu_head_queue(head);
head->func = func;
head->next = NULL;
diff --git a/kernel/sched.c b/kernel/sched.c
index 265cf3a2b5d8..41541d79e3c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
#include <asm/irq_regs.h>
#include "sched_cpupri.h"
+#include "workqueue_sched.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ
u64 nohz_stamp;
- unsigned char in_nohz_recently;
+ unsigned char nohz_balance_kick;
#endif
unsigned int skip_clock_update;
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
#ifdef CONFIG_NO_HZ
/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu. This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+ int cpu = smp_processor_id();
+ int i;
+ struct sched_domain *sd;
+
+ for_each_domain(cpu, sd) {
+ for_each_cpu(i, sched_domain_span(sd))
+ if (!idle_cpu(i))
+ return i;
+ }
+ return cpu;
+}
+/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
-int nohz_ratelimit(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 diff = rq->clock - rq->nohz_stamp;
-
- rq->nohz_stamp = rq->clock;
-
- return diff < (NSEC_PER_SEC / HZ) >> 1;
-}
-
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
if (root_task_group_empty())
return;
- now = cpu_clock(raw_smp_processor_id());
+ now = local_clock();
elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
-/***
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+ bool is_sync, bool is_migrate, bool is_local,
+ unsigned long en_flags)
+{
+ schedstat_inc(p, se.statistics.nr_wakeups);
+ if (is_sync)
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
+ if (is_migrate)
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+ if (is_local)
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ else
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+
+ activate_task(rq, p, en_flags);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+ int wake_flags, bool success)
+{
+ trace_sched_wakeup(p, success);
+ check_preempt_curr(rq, p, wake_flags);
+
+ p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_woken)
+ p->sched_class->task_woken(rq, p);
+
+ if (unlikely(rq->idle_stamp)) {
+ u64 delta = rq->clock - rq->idle_stamp;
+ u64 max = 2*sysctl_sched_migration_cost;
+
+ if (delta > max)
+ rq->avg_idle = max;
+ else
+ update_avg(&rq->avg_idle, delta);
+ rq->idle_stamp = 0;
+ }
+#endif
+ /* if a worker is waking up, notify workqueue */
+ if ((p->flags & PF_WQ_WORKER) && success)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/**
* try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
+ * @p: the thread to be awakened
* @state: the mask of task states that can be woken
- * @sync: do a synchronous wakeup?
+ * @wake_flags: wake modifier flags (WF_*)
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
@@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
- * returns failure only if the task is already active.
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state,
int wake_flags)
@@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
out_activate:
#endif /* CONFIG_SMP */
- schedstat_inc(p, se.statistics.nr_wakeups);
- if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (orig_cpu != cpu)
- schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (cpu == this_cpu)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
- activate_task(rq, p, en_flags);
+ ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+ cpu == this_cpu, en_flags);
success = 1;
-
out_running:
- trace_sched_wakeup(p, success);
- check_preempt_curr(rq, p, wake_flags);
-
- p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
- p->sched_class->task_woken(rq, p);
-
- if (unlikely(rq->idle_stamp)) {
- u64 delta = rq->clock - rq->idle_stamp;
- u64 max = 2*sysctl_sched_migration_cost;
-
- if (delta > max)
- rq->avg_idle = max;
- else
- update_avg(&rq->avg_idle, delta);
- rq->idle_stamp = 0;
- }
-#endif
+ ttwu_post_activation(p, rq, wake_flags, success);
out:
task_rq_unlock(rq, &flags);
put_cpu();
@@ -2399,6 +2431,37 @@ out:
}
/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not alredy there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task. this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+ bool success = false;
+
+ BUG_ON(rq != this_rq());
+ BUG_ON(p == current);
+ lockdep_assert_held(&rq->lock);
+
+ if (!(p->state & TASK_NORMAL))
+ return;
+
+ if (!p->se.on_rq) {
+ if (likely(!task_running(rq, p))) {
+ schedstat_inc(rq, ttwu_count);
+ schedstat_inc(rq, ttwu_local);
+ }
+ ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+ success = true;
+ }
+ ttwu_post_activation(p, rq, 0, success);
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
}
/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT 7
+static const unsigned char
+ degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+ degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+ {0, 0, 0, 0, 0, 0, 0, 0},
+ {64, 32, 8, 0, 0, 0, 0, 0},
+ {96, 72, 40, 12, 1, 0, 0},
+ {112, 98, 75, 43, 15, 1, 0},
+ {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+ int j = 0;
+
+ if (!missed_updates)
+ return load;
+
+ if (missed_updates >= degrade_zero_ticks[idx])
+ return 0;
+
+ if (idx == 1)
+ return load >> missed_updates;
+
+ while (missed_updates) {
+ if (missed_updates % 2)
+ load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+ missed_updates >>= 1;
+ j++;
+ }
+ return load;
+}
+
+/*
* Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
*/
static void update_cpu_load(struct rq *this_rq)
{
unsigned long this_load = this_rq->load.weight;
+ unsigned long curr_jiffies = jiffies;
+ unsigned long pending_updates;
int i, scale;
this_rq->nr_load_updates++;
+ /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+ if (curr_jiffies == this_rq->last_load_update_tick)
+ return;
+
+ pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+ this_rq->last_load_update_tick = curr_jiffies;
+
/* Update our load: */
- for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
unsigned long old_load, new_load;
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i];
+ old_load = decay_load_missed(old_load, pending_updates - 1, i);
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
* example.
*/
if (new_load > old_load)
- new_load += scale-1;
- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+ new_load += scale - 1;
+
+ this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+ update_cpu_load(this_rq);
calc_load_account_active(this_rq);
}
@@ -3426,7 +3574,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
- update_cpu_load(rq);
+ update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
@@ -3598,7 +3746,6 @@ need_resched:
rq = cpu_rq(cpu);
rcu_note_context_switch(cpu);
prev = rq->curr;
- switch_count = &prev->nivcsw;
release_kernel_lock(prev);
need_resched_nonpreemptible:
@@ -3611,11 +3758,26 @@ need_resched_nonpreemptible:
raw_spin_lock_irq(&rq->lock);
clear_tsk_need_resched(prev);
+ switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely(signal_pending_state(prev->state, prev)))
+ if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
- else
+ } else {
+ /*
+ * If a worker is going to sleep, notify and
+ * ask workqueue whether it wants to wake up a
+ * task to maintain concurrency. If so, wake
+ * up the task.
+ */
+ if (prev->flags & PF_WQ_WORKER) {
+ struct task_struct *to_wakeup;
+
+ to_wakeup = wq_worker_sleeping(prev, cpu);
+ if (to_wakeup)
+ try_to_wake_up_local(to_wakeup);
+ }
deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ }
switch_count = &prev->nvcsw;
}
@@ -3637,8 +3799,10 @@ need_resched_nonpreemptible:
context_switch(rq, prev, next); /* unlocks the rq */
/*
- * the context switch might have flipped the stack from under
- * us, hence refresh the local variables.
+ * The context switch have flipped the stack from under us
+ * and restored the local variables which were saved when
+ * this task called schedule() in the past. prev == current
+ * is still correct, but it can be moved to another cpu/rq.
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3647,11 +3811,8 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(current) < 0)) {
- prev = rq->curr;
- switch_count = &prev->nivcsw;
+ if (unlikely(reacquire_kernel_lock(prev)))
goto need_resched_nonpreemptible;
- }
preempt_enable_no_resched();
if (need_resched())
@@ -4441,12 +4602,8 @@ recheck:
*/
if (user && !capable(CAP_SYS_NICE)) {
if (rt_policy(policy)) {
- unsigned long rlim_rtprio;
-
- if (!lock_task_sighand(p, &flags))
- return -ESRCH;
- rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
- unlock_task_sighand(p, &flags);
+ unsigned long rlim_rtprio =
+ task_rlimit(p, RLIMIT_RTPRIO);
/* can't set/change the rt policy */
if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
*/
static struct notifier_block __cpuinitdata migration_notifier = {
.notifier_call = migration_call,
- .priority = 10
+ .priority = CPU_PRI_MIGRATION,
};
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ set_cpu_active((long)hcpu, true);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ set_cpu_active((long)hcpu, false);
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
static int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
- /* Start one for the boot CPU: */
+ /* Initialize migration for the boot CPU */
err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
BUG_ON(err == NOTIFY_BAD);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
+ /* Register cpu active notifiers */
+ cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+ cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
return 0;
}
early_initcall(migration_init);
@@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
free_rootdomain(old_rd);
}
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd)
{
- gfp_t gfp = GFP_KERNEL;
-
memset(rd, 0, sizeof(*rd));
- if (bootmem)
- gfp = GFP_NOWAIT;
-
- if (!alloc_cpumask_var(&rd->span, gfp))
+ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
- if (!alloc_cpumask_var(&rd->online, gfp))
+ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
- if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_online;
- if (cpupri_init(&rd->cpupri, bootmem) != 0)
+ if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
return 0;
@@ -6096,7 +6277,7 @@ out:
static void init_defrootdomain(void)
{
- init_rootdomain(&def_root_domain, true);
+ init_rootdomain(&def_root_domain);
atomic_set(&def_root_domain.refcount, 1);
}
@@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
if (!rd)
return NULL;
- if (init_rootdomain(rd, false) != 0) {
+ if (init_rootdomain(rd) != 0) {
kfree(rd);
return NULL;
}
@@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-#ifndef CONFIG_CPUSETS
/*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
*/
-static int update_sched_domains(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
{
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- partition_sched_domains(1, NULL, NULL);
+ cpuset_update_active_cpus();
return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ cpuset_update_active_cpus();
+ return NOTIFY_OK;
default:
return NOTIFY_DONE;
}
}
-#endif
static int update_runtime(struct notifier_block *nfb,
unsigned long action, void *hcpu)
@@ -7356,10 +7543,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
-#ifndef CONFIG_CPUSETS
- /* XXX: Theoretical race here - CPU may be hotplugged now */
- hotcpu_notifier(update_sched_domains, 0);
-#endif
+ hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+ hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
/* RT runtime code needs to handle some hotplug events */
hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7789,9 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
+
+ rq->last_load_update_tick = jiffies;
+
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -7617,6 +7805,10 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+ rq->nohz_balance_kick = 0;
+ init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7853,11 @@ void __init sched_init(void)
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
+ atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+ atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
* Ingo Molnar <mingo@redhat.com>
* Guillaume Chazarain <guichaz@gmail.com>
*
- * Create a semi stable clock from a mixture of other events, including:
- * - gtod
+ *
+ * What:
+ *
+ * cpu_clock(i) provides a fast (execution time) high resolution
+ * clock with bounded drift between CPUs. The value of cpu_clock(i)
+ * is monotonic for constant i. The timestamp returned is in nanoseconds.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ *
+ * There is no strict promise about the base, although it tends to start
+ * at 0 on boot (but people really shouldn't rely on that).
+ *
+ * cpu_clock(i) -- can be used from any context, including NMI.
+ * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
+ * local_clock() -- is cpu_clock() on the current cpu.
+ *
+ * How:
+ *
+ * The implementation either uses sched_clock() when
+ * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
+ * sched_clock() is assumed to provide these properties (mostly it means
+ * the architecture provides a globally synchronized highres time source).
+ *
+ * Otherwise it tries to create a semi stable clock from a mixture of other
+ * clocks, including:
+ *
+ * - GTOD (clock monotomic)
* - sched_clock()
* - explicit idle events
*
- * We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window.
+ * We use GTOD as base and use sched_clock() deltas to improve resolution. The
+ * deltas are filtered to provide monotonicity and keeping it within an
+ * expected window.
*
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
- * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 2 jiffies difference).
+ *
+ * Notes:
+ *
+ * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
+ * like cpufreq interrupts that can change the base clock (TSC) multiplier
+ * and cause funny jumps in time -- although the filtering provided by
+ * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
+ * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
+ * sched_clock().
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
return val;
}
+/*
+ * Similar to cpu_clock(), but requires local IRQs to be disabled.
+ *
+ * See cpu_clock().
+ */
u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-unsigned long long cpu_clock(int cpu)
+/*
+ * As outlined at the top, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ */
+u64 cpu_clock(int cpu)
{
- unsigned long long clock;
+ u64 clock;
unsigned long flags;
local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
return clock;
}
+/*
+ * Similar to cpu_clock() for the current cpu. Time will only be observed
+ * to be monotonic if care is taken to only compare timestampt taken on the
+ * same CPU.
+ *
+ * See cpu_clock().
+ */
+u64 local_clock(void)
+{
+ u64 clock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ clock = sched_clock_cpu(smp_processor_id());
+ local_irq_restore(flags);
+
+ return clock;
+}
+
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-
-unsigned long long cpu_clock(int cpu)
+u64 cpu_clock(int cpu)
{
return sched_clock_cpu(cpu);
}
+u64 local_clock(void)
+{
+ return sched_clock_cpu(0);
+}
+
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
EXPORT_SYMBOL_GPL(cpu_clock);
+EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
*
* Returns: -ENOMEM if memory fails.
*/
-int cpupri_init(struct cpupri *cp, bool bootmem)
+int cpupri_init(struct cpupri *cp)
{
- gfp_t gfp = GFP_KERNEL;
int i;
- if (bootmem)
- gfp = GFP_NOWAIT;
-
memset(cp, 0, sizeof(*cp));
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
raw_spin_lock_init(&vec->lock);
vec->count = 0;
- if (!zalloc_cpumask_var(&vec->mask, gfp))
+ if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
goto cleanup;
}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
int cpupri_find(struct cpupri *cp,
struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
-int cpupri_init(struct cpupri *cp, bool bootmem);
+int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#else
#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
- PN(sysctl_sched_child_runs_first);
+ P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#undef PN
#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
- else
- power *= default_scale_freq_power(sd, cpu);
-
- power >>= SCHED_LOAD_SHIFT;
-
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
if (sched_feat(ARCH_POWER))
power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
power >>= SCHED_LOAD_SHIFT;
}
+ sdg->cpu_power_orig = power;
+
+ if (sched_feat(ARCH_POWER))
+ power *= arch_scale_freq_power(sd, cpu);
+ else
+ power *= default_scale_freq_power(sd, cpu);
+
+ power >>= SCHED_LOAD_SHIFT;
+
power *= scale_rt_power(cpu);
power >>= SCHED_LOAD_SHIFT;
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
sdg->cpu_power = power;
}
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+ /*
+ * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ */
+ if (sd->level != SD_LV_SIBLING)
+ return 0;
+
+ /*
+ * If ~90% of the cpu_power is still there, we're good.
+ */
+ if (group->cpu_power * 32 > group->cpu_power_orig * 29)
+ return 1;
+
+ return 0;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @sd: The sched_domain whose statistics are to be updated.
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu) {
- *balance = 0;
- return;
+ if (idle != CPU_NEWLY_IDLE && local_group) {
+ if (balance_cpu != this_cpu) {
+ *balance = 0;
+ return;
+ }
+ update_group_power(sd, this_cpu);
}
- update_group_power(sd, this_cpu);
-
/* Adjust by relative CPU power of the group */
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
sgs->group_capacity =
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ if (!sgs->group_capacity)
+ sgs->group_capacity = fix_small_capacity(sd, group);
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ struct sched_group *sg,
+ struct sg_lb_stats *sgs,
+ int this_cpu)
+{
+ if (sgs->avg_load <= sds->max_load)
+ return false;
+
+ if (sgs->sum_nr_running > sgs->group_capacity)
+ return true;
+
+ if (sgs->group_imb)
+ return true;
+
+ /*
+ * ASYM_PACKING needs to move all the work to the lowest
+ * numbered CPUs in the group, therefore mark all groups
+ * higher than ourself as busy.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ this_cpu < group_first_cpu(sg)) {
+ if (!sds->busiest)
+ return true;
+
+ if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ return true;
+ }
+
+ return false;
}
/**
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing group.
+ * @sd_idle: Idle status of the sched_domain containing sg.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
- struct sched_group *group = sd->groups;
+ struct sched_group *sg = sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
do {
int local_group;
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += group->cpu_power;
+ sds->total_pwr += sg->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the group capacity to one so that we'll try
+ * first, lower the sg capacity to one so that we'll try
* and move all the excess tasks away.
*/
if (prefer_sibling)
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
if (local_group) {
sds->this_load = sgs.avg_load;
- sds->this = group;
+ sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
- } else if (sgs.avg_load > sds->max_load &&
- (sgs.sum_nr_running > sgs.group_capacity ||
- sgs.group_imb)) {
+ } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
sds->max_load = sgs.avg_load;
- sds->busiest = group;
+ sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
sds->busiest_group_capacity = sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->group_imb = sgs.group_imb;
}
- update_sd_power_savings_stats(group, sds, local_group, &sgs);
- group = group->next;
- } while (group != sd->groups);
+ update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+ sg = sg->next;
+ } while (sg != sd->groups);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ * sched doman.
+ *
+ * This is primarily intended to used at the sibling level. Some
+ * cores like POWER7 prefer to use lower numbered SMT threads. In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle. When in lower SMT modes, the threads will
+ * perform better since they share less core resources. Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads. It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on. Here we are
+ * assuming lower CPU number will be equivalent to lower a SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU. The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalanced due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+ struct sd_lb_stats *sds,
+ int this_cpu, unsigned long *imbalance)
+{
+ int busiest_cpu;
+
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return 0;
+
+ if (!sds->busiest)
+ return 0;
+
+ busiest_cpu = group_first_cpu(sds->busiest);
+ if (this_cpu > busiest_cpu)
+ return 0;
+
+ *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
+ SCHED_LOAD_SCALE);
+ return 1;
}
/**
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!(*balance))
goto ret;
+ if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(sd, &sds, this_cpu, imbalance))
+ return sds.busiest;
+
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
@@ -2726,8 +2850,9 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *
-find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- unsigned long imbalance, const struct cpumask *cpus)
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+ enum cpu_idle_type idle, unsigned long imbalance,
+ const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl;
+ if (!capacity)
+ capacity = fix_small_capacity(sd, group);
+
if (!cpumask_test_cpu(i, cpus))
continue;
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
+static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+ int busiest_cpu, int this_cpu)
{
if (idle == CPU_NEWLY_IDLE) {
+
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * higher numbered CPUs in order to pack all tasks in the
+ * lowest numbered CPUs.
+ */
+ if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ return 1;
+
/*
* The only task running in a non-idle cpu can be moved to this
* cpu in an attempt to completely freeup the other CPU
@@ -2854,7 +2992,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -2898,7 +3036,8 @@ redo:
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
- if (need_active_balance(sd, sd_idle, idle)) {
+ if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
+ this_cpu)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ out_unlock:
}
#ifdef CONFIG_NO_HZ
+
+static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
+
+static void trigger_sched_softirq(void *data)
+{
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+}
+
+static inline void init_sched_softirq_csd(struct call_single_data *csd)
+{
+ csd->func = trigger_sched_softirq;
+ csd->info = NULL;
+ csd->flags = 0;
+ csd->priv = 0;
+}
+
+/*
+ * idle load balancing details
+ * - One of the idle CPUs nominates itself as idle load_balancer, while
+ * entering idle.
+ * - This idle load balancer CPU will also go into tickless mode when
+ * it is idle, just like all other idle CPUs
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
+ */
static struct {
atomic_t load_balancer;
- cpumask_var_t cpu_mask;
- cpumask_var_t ilb_grp_nohz_mask;
-} nohz ____cacheline_aligned = {
- .load_balancer = ATOMIC_INIT(-1),
-};
+ atomic_t first_pick_cpu;
+ atomic_t second_pick_cpu;
+ cpumask_var_t idle_cpus_mask;
+ cpumask_var_t grp_idle_mask;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
int get_nohz_load_balancer(void)
{
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
*/
static inline int is_semi_idle_group(struct sched_group *ilb_group)
{
- cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+ cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
sched_group_cpus(ilb_group));
/*
* A sched_group is semi-idle when it has atleast one busy cpu
* and atleast one idle cpu.
*/
- if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+ if (cpumask_empty(nohz.grp_idle_mask))
return 0;
- if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+ if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
return 0;
return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
* Optimize for the case when we have no idle CPUs or only one
* idle CPU. Don't walk the sched_domain hierarchy in such cases
*/
- if (cpumask_weight(nohz.cpu_mask) < 2)
+ if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
do {
if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.ilb_grp_nohz_mask);
+ return cpumask_first(nohz.grp_idle_mask);
ilb_group = ilb_group->next;
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
}
out_done:
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
{
- return cpumask_first(nohz.cpu_mask);
+ return nr_cpu_ids;
}
#endif
/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+ int ilb_cpu;
+
+ nohz.next_balance++;
+
+ ilb_cpu = get_nohz_load_balancer();
+
+ if (ilb_cpu >= nr_cpu_ids) {
+ ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
+ }
+
+ if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
+ struct call_single_data *cp;
+
+ cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ cp = &per_cpu(remote_sched_softirq_cb, cpu);
+ __smp_call_function_single(ilb_cpu, cp, 0);
+ }
+ return;
+}
+
+/*
* This routine will try to nominate the ilb (idle load balancing)
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus. If all the cpus in the system
- * go into this tickless mode, then there will be no ilb owner (as there is
- * no need for one) and all the cpus will sleep till the next wakeup event
- * arrives...
- *
- * For the ilb owner, tick is not stopped. And this tick will be used
- * for idle load balancing. ilb owner will still be part of
- * nohz.cpu_mask..
+ * load balancing on behalf of all those cpus.
*
- * While stopping the tick, this cpu will become the ilb owner if there
- * is no other owner. And will be the owner till that cpu becomes busy
- * or if all cpus in the system stop their ticks at which point
- * there is no need for ilb owner.
+ * When the ilb owner becomes busy, we will not have new ilb owner until some
+ * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
+ * idle load balancing by kicking one of the idle CPUs.
*
- * When the ilb owner becomes busy, it nominates another owner, during the
- * next busy scheduler_tick()
+ * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
+ * ilb owner CPU in future (when there is a need for idle load balancing on
+ * behalf of all idle CPUs).
*/
-int select_nohz_load_balancer(int stop_tick)
+void select_nohz_load_balancer(int stop_tick)
{
int cpu = smp_processor_id();
if (stop_tick) {
- cpu_rq(cpu)->in_nohz_recently = 1;
-
if (!cpu_active(cpu)) {
if (atomic_read(&nohz.load_balancer) != cpu)
- return 0;
+ return;
/*
* If we are going offline and still the leader,
* give up!
*/
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
- return 0;
+ return;
}
- cpumask_set_cpu(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
- if (atomic_read(&nohz.load_balancer) == cpu)
- atomic_set(&nohz.load_balancer, -1);
- return 0;
- }
+ if (atomic_read(&nohz.first_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
+ if (atomic_read(&nohz.second_pick_cpu) == cpu)
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.load_balancer) == -1) {
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
- return 1;
- } else if (atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
int new_ilb;
- if (!(sched_smt_power_savings ||
- sched_mc_power_savings))
- return 1;
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
+ cpu) != nr_cpu_ids)
+ return;
+
/*
* Check to see if there is a more power-efficient
* ilb.
*/
new_ilb = find_new_ilb(cpu);
if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, -1);
+ atomic_set(&nohz.load_balancer, nr_cpu_ids);
resched_cpu(new_ilb);
- return 0;
+ return;
}
- return 1;
+ return;
}
} else {
- if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
- return 0;
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return;
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu,
+ nr_cpu_ids) != cpu)
BUG();
}
- return 0;
+ return;
}
#endif
@@ -3385,11 +3569,102 @@ out:
rq->next_balance = next_balance;
}
+#ifdef CONFIG_NO_HZ
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * In CONFIG_NO_HZ case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+ struct rq *this_rq = cpu_rq(this_cpu);
+ struct rq *rq;
+ int balance_cpu;
+
+ if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
+ return;
+
+ for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched()) {
+ this_rq->nohz_balance_kick = 0;
+ break;
+ }
+
+ raw_spin_lock_irq(&this_rq->lock);
+ update_rq_clock(this_rq);
+ update_cpu_load(this_rq);
+ raw_spin_unlock_irq(&this_rq->lock);
+
+ rebalance_domains(balance_cpu, CPU_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(this_rq->next_balance, rq->next_balance))
+ this_rq->next_balance = rq->next_balance;
+ }
+ nohz.next_balance = this_rq->next_balance;
+ this_rq->nohz_balance_kick = 0;
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer
+ * - first_pick_cpu is the one of the busy CPUs. It will kick
+ * idle load balancer when it has more than one process active. This
+ * eliminates the need for idle load balancing altogether when we have
+ * only one running process in the system (common case).
+ * - If there are more than one busy CPU, idle load balancer may have
+ * to run for active_load_balance to happen (i.e., two busy CPUs are
+ * SMT or core siblings and can run better if they move to different
+ * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
+ * which will kick idle load balancer as soon as it has any load.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+ unsigned long now = jiffies;
+ int ret;
+ int first_pick_cpu, second_pick_cpu;
+
+ if (time_before(now, nohz.next_balance))
+ return 0;
+
+ if (!rq->nr_running)
+ return 0;
+
+ first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
+ second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+
+ if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
+ second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ return 0;
+
+ ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ if (rq->nr_running > 1)
+ return 1;
+ } else {
+ ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
+ if (ret == nr_cpu_ids || ret == cpu) {
+ if (rq->nr_running)
+ return 1;
+ }
+ }
+ return 0;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
rebalance_domains(this_cpu, idle);
-#ifdef CONFIG_NO_HZ
/*
- * If this cpu is the owner for idle load balancing, then do the
+ * If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
- if (this_rq->idle_at_tick &&
- atomic_read(&nohz.load_balancer) == this_cpu) {
- struct rq *rq;
- int balance_cpu;
-
- for_each_cpu(balance_cpu, nohz.cpu_mask) {
- if (balance_cpu == this_cpu)
- continue;
-
- /*
- * If this cpu gets work to do, stop the load balancing
- * work being done for other cpus. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
-
- rebalance_domains(balance_cpu, CPU_IDLE);
-
- rq = cpu_rq(balance_cpu);
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
- }
- }
-#endif
+ nohz_idle_balance(this_cpu, idle);
}
static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- *
- * In case of CONFIG_NO_HZ, this is the place where we nominate a new
- * idle load balancing owner or decide to stop the periodic load balancing,
- * if the whole system is idle.
*/
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
-#ifdef CONFIG_NO_HZ
- /*
- * If we were in the nohz mode recently and busy at the current
- * scheduler tick, then check if we need to nominate new idle
- * load balancer.
- */
- if (rq->in_nohz_recently && !rq->idle_at_tick) {
- rq->in_nohz_recently = 0;
-
- if (atomic_read(&nohz.load_balancer) == cpu) {
- cpumask_clear_cpu(cpu, nohz.cpu_mask);
- atomic_set(&nohz.load_balancer, -1);
- }
-
- if (atomic_read(&nohz.load_balancer) == -1) {
- int ilb = find_new_ilb(cpu);
-
- if (ilb < nr_cpu_ids)
- resched_cpu(ilb);
- }
- }
-
- /*
- * If this cpu is idle and doing idle load balancing for all the
- * cpus with ticks stopped, is it time for that to stop?
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
- resched_cpu(cpu);
- return;
- }
-
- /*
- * If this cpu is idle and the idle load balancing is done by
- * someone else, then no need raise the SCHED_SOFTIRQ
- */
- if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpumask_test_cpu(cpu, nohz.cpu_mask))
- return;
-#endif
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+ else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ nohz_balancer_kick(cpu);
+#endif
}
static void rq_online_fair(struct rq *rq)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
{
unsigned long soft, hard;
- if (!p->signal)
- return;
-
/* max may change after cur was read, this will be fixed next tick */
soft = task_rlimit(p, RLIMIT_RTTIME);
hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
static inline void account_group_user_time(struct task_struct *tsk,
cputime_t cputime)
{
- struct thread_group_cputimer *cputimer;
-
- /* tsk == current, ensure it is safe to use ->signal */
- if (unlikely(tsk->exit_state))
- return;
-
- cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running)
return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
static inline void account_group_system_time(struct task_struct *tsk,
cputime_t cputime)
{
- struct thread_group_cputimer *cputimer;
-
- /* tsk == current, ensure it is safe to use ->signal */
- if (unlikely(tsk->exit_state))
- return;
-
- cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running)
return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
- struct thread_group_cputimer *cputimer;
- struct signal_struct *sig;
-
- sig = tsk->signal;
- /* see __exit_signal()->task_rq_unlock_wait() */
- barrier();
- if (unlikely(!sig))
- return;
-
- cputimer = &sig->cputimer;
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
if (!cputimer->running)
return;
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
/*
* Bad permissions for sending the signal
- * - the caller must hold at least the RCU read lock
+ * - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
@@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
/*
* send signal info to all the members of a group
- * - the caller must hold the RCU read lock at least
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
- int ret = check_kill_permission(sig, info, p);
+ int ret;
+
+ rcu_read_lock();
+ ret = check_kill_permission(sig, info, p);
+ rcu_read_unlock();
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, true);
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
deleted file mode 100644
index e45c43645298..000000000000
--- a/kernel/slow-work-debugfs.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Slow work debugging
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include "slow-work.h"
-
-#define ITERATOR_SHIFT (BITS_PER_LONG - 4)
-#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT)
-#define ITERATOR_COUNTER (~ITERATOR_SELECTOR)
-
-void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
-{
- seq_puts(m, "Slow-work: New thread");
-}
-
-/*
- * Render the time mark field on a work item into a 5-char time with units plus
- * a space
- */
-static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
-{
- struct timespec now, diff;
-
- now = CURRENT_TIME;
- diff = timespec_sub(now, work->mark);
-
- if (diff.tv_sec < 0)
- seq_puts(m, " -ve ");
- else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
- seq_printf(m, "%3luns ", diff.tv_nsec);
- else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
- seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
- else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
- seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
- else if (diff.tv_sec <= 1)
- seq_puts(m, " 1s ");
- else if (diff.tv_sec < 60)
- seq_printf(m, "%4lus ", diff.tv_sec);
- else if (diff.tv_sec < 60 * 60)
- seq_printf(m, "%4lum ", diff.tv_sec / 60);
- else if (diff.tv_sec < 60 * 60 * 24)
- seq_printf(m, "%4luh ", diff.tv_sec / 3600);
- else
- seq_puts(m, "exces ");
-}
-
-/*
- * Describe a slow work item for debugfs
- */
-static int slow_work_runqueue_show(struct seq_file *m, void *v)
-{
- struct slow_work *work;
- struct list_head *p = v;
- unsigned long id;
-
- switch ((unsigned long) v) {
- case 1:
- seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n");
- return 0;
- case 2:
- seq_puts(m, "=== ===== ================ == ===== ==========\n");
- return 0;
-
- case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
- id = (unsigned long) v - 3;
-
- read_lock(&slow_work_execs_lock);
- work = slow_work_execs[id];
- if (work) {
- smp_read_barrier_depends();
-
- seq_printf(m, "%3lu %5d %16p %2lx ",
- id, slow_work_pids[id], work, work->flags);
- slow_work_print_mark(m, work);
-
- if (work->ops->desc)
- work->ops->desc(work, m);
- seq_putc(m, '\n');
- }
- read_unlock(&slow_work_execs_lock);
- return 0;
-
- default:
- work = list_entry(p, struct slow_work, link);
- seq_printf(m, "%3s - %16p %2lx ",
- work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
- work, work->flags);
- slow_work_print_mark(m, work);
-
- if (work->ops->desc)
- work->ops->desc(work, m);
- seq_putc(m, '\n');
- return 0;
- }
-}
-
-/*
- * map the iterator to a work item
- */
-static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
-{
- struct list_head *p;
- unsigned long count, id;
-
- switch (*_pos >> ITERATOR_SHIFT) {
- case 0x0:
- if (*_pos == 0)
- *_pos = 1;
- if (*_pos < 3)
- return (void *)(unsigned long) *_pos;
- if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
- for (id = *_pos - 3;
- id < SLOW_WORK_THREAD_LIMIT;
- id++, (*_pos)++)
- if (slow_work_execs[id])
- return (void *)(unsigned long) *_pos;
- *_pos = 0x1UL << ITERATOR_SHIFT;
-
- case 0x1:
- count = *_pos & ITERATOR_COUNTER;
- list_for_each(p, &slow_work_queue) {
- if (count == 0)
- return p;
- count--;
- }
- *_pos = 0x2UL << ITERATOR_SHIFT;
-
- case 0x2:
- count = *_pos & ITERATOR_COUNTER;
- list_for_each(p, &vslow_work_queue) {
- if (count == 0)
- return p;
- count--;
- }
- *_pos = 0x3UL << ITERATOR_SHIFT;
-
- default:
- return NULL;
- }
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
-{
- spin_lock_irq(&slow_work_queue_lock);
- return slow_work_runqueue_index(m, _pos);
-}
-
-/*
- * move to the next line
- */
-static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
-{
- struct list_head *p = v;
- unsigned long selector = *_pos >> ITERATOR_SHIFT;
-
- (*_pos)++;
- switch (selector) {
- case 0x0:
- return slow_work_runqueue_index(m, _pos);
-
- case 0x1:
- if (*_pos >> ITERATOR_SHIFT == 0x1) {
- p = p->next;
- if (p != &slow_work_queue)
- return p;
- }
- *_pos = 0x2UL << ITERATOR_SHIFT;
- p = &vslow_work_queue;
-
- case 0x2:
- if (*_pos >> ITERATOR_SHIFT == 0x2) {
- p = p->next;
- if (p != &vslow_work_queue)
- return p;
- }
- *_pos = 0x3UL << ITERATOR_SHIFT;
-
- default:
- return NULL;
- }
-}
-
-/*
- * clean up after reading
- */
-static void slow_work_runqueue_stop(struct seq_file *m, void *v)
-{
- spin_unlock_irq(&slow_work_queue_lock);
-}
-
-static const struct seq_operations slow_work_runqueue_ops = {
- .start = slow_work_runqueue_start,
- .stop = slow_work_runqueue_stop,
- .next = slow_work_runqueue_next,
- .show = slow_work_runqueue_show,
-};
-
-/*
- * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
- */
-static int slow_work_runqueue_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &slow_work_runqueue_ops);
-}
-
-const struct file_operations slow_work_runqueue_fops = {
- .owner = THIS_MODULE,
- .open = slow_work_runqueue_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
deleted file mode 100644
index 7d3f4fa9ef4f..000000000000
--- a/kernel/slow-work.c
+++ /dev/null
@@ -1,1068 +0,0 @@
-/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- *
- * See Documentation/slow-work.txt
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-#include <linux/wait.h>
-#include <linux/debugfs.h>
-#include "slow-work.h"
-
-static void slow_work_cull_timeout(unsigned long);
-static void slow_work_oom_timeout(unsigned long);
-
-#ifdef CONFIG_SYSCTL
-static int slow_work_min_threads_sysctl(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-
-static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
-#endif
-
-/*
- * The pool of threads has at least min threads in it as long as someone is
- * using the facility, and may have as many as max.
- *
- * A portion of the pool may be processing very slow operations.
- */
-static unsigned slow_work_min_threads = 2;
-static unsigned slow_work_max_threads = 4;
-static unsigned vslow_work_proportion = 50; /* % of threads that may process
- * very slow work */
-
-#ifdef CONFIG_SYSCTL
-static const int slow_work_min_min_threads = 2;
-static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
-static const int slow_work_min_vslow = 1;
-static const int slow_work_max_vslow = 99;
-
-ctl_table slow_work_sysctls[] = {
- {
- .procname = "min-threads",
- .data = &slow_work_min_threads,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = slow_work_min_threads_sysctl,
- .extra1 = (void *) &slow_work_min_min_threads,
- .extra2 = &slow_work_max_threads,
- },
- {
- .procname = "max-threads",
- .data = &slow_work_max_threads,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = slow_work_max_threads_sysctl,
- .extra1 = &slow_work_min_threads,
- .extra2 = (void *) &slow_work_max_max_threads,
- },
- {
- .procname = "vslow-percentage",
- .data = &vslow_work_proportion,
- .maxlen = sizeof(unsigned),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *) &slow_work_min_vslow,
- .extra2 = (void *) &slow_work_max_vslow,
- },
- {}
-};
-#endif
-
-/*
- * The active state of the thread pool
- */
-static atomic_t slow_work_thread_count;
-static atomic_t vslow_work_executing_count;
-
-static bool slow_work_may_not_start_new_thread;
-static bool slow_work_cull; /* cull a thread due to lack of activity */
-static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
-static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
-static struct slow_work slow_work_new_thread; /* new thread starter */
-
-/*
- * slow work ID allocation (use slow_work_queue_lock)
- */
-static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
-
-/*
- * Unregistration tracking to prevent put_ref() from disappearing during module
- * unload
- */
-#ifdef CONFIG_MODULES
-static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
-static struct module *slow_work_unreg_module;
-static struct slow_work *slow_work_unreg_work_item;
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
-static DEFINE_MUTEX(slow_work_unreg_sync_lock);
-
-static void slow_work_set_thread_processing(int id, struct slow_work *work)
-{
- if (work)
- slow_work_thread_processing[id] = work->owner;
-}
-static void slow_work_done_thread_processing(int id, struct slow_work *work)
-{
- struct module *module = slow_work_thread_processing[id];
-
- slow_work_thread_processing[id] = NULL;
- smp_mb();
- if (slow_work_unreg_work_item == work ||
- slow_work_unreg_module == module)
- wake_up_all(&slow_work_unreg_wq);
-}
-static void slow_work_clear_thread_processing(int id)
-{
- slow_work_thread_processing[id] = NULL;
-}
-#else
-static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
-static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
-static void slow_work_clear_thread_processing(int id) {}
-#endif
-
-/*
- * Data for tracking currently executing items for indication through /proc
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
-pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
-DEFINE_RWLOCK(slow_work_execs_lock);
-#endif
-
-/*
- * The queues of work items and the lock governing access to them. These are
- * shared between all the CPUs. It doesn't make sense to have per-CPU queues
- * as the number of threads bears no relation to the number of CPUs.
- *
- * There are two queues of work items: one for slow work items, and one for
- * very slow work items.
- */
-LIST_HEAD(slow_work_queue);
-LIST_HEAD(vslow_work_queue);
-DEFINE_SPINLOCK(slow_work_queue_lock);
-
-/*
- * The following are two wait queues that get pinged when a work item is placed
- * on an empty queue. These allow work items that are hogging a thread by
- * sleeping in a way that could be deferred to yield their thread and enqueue
- * themselves.
- */
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
-static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
-
-/*
- * The thread controls. A variable used to signal to the threads that they
- * should exit when the queue is empty, a waitqueue used by the threads to wait
- * for signals, and a completion set by the last thread to exit.
- */
-static bool slow_work_threads_should_exit;
-static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
-static DECLARE_COMPLETION(slow_work_last_thread_exited);
-
-/*
- * The number of users of the thread pool and its lock. Whilst this is zero we
- * have no threads hanging around, and when this reaches zero, we wait for all
- * active or queued work items to complete and kill all the threads we do have.
- */
-static int slow_work_user_count;
-static DEFINE_MUTEX(slow_work_user_lock);
-
-static inline int slow_work_get_ref(struct slow_work *work)
-{
- if (work->ops->get_ref)
- return work->ops->get_ref(work);
-
- return 0;
-}
-
-static inline void slow_work_put_ref(struct slow_work *work)
-{
- if (work->ops->put_ref)
- work->ops->put_ref(work);
-}
-
-/*
- * Calculate the maximum number of active threads in the pool that are
- * permitted to process very slow work items.
- *
- * The answer is rounded up to at least 1, but may not equal or exceed the
- * maximum number of the threads in the pool. This means we always have at
- * least one thread that can process slow work items, and we always have at
- * least one thread that won't get tied up doing so.
- */
-static unsigned slow_work_calc_vsmax(void)
-{
- unsigned vsmax;
-
- vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
- vsmax /= 100;
- vsmax = max(vsmax, 1U);
- return min(vsmax, slow_work_max_threads - 1);
-}
-
-/*
- * Attempt to execute stuff queued on a slow thread. Return true if we managed
- * it, false if there was nothing to do.
- */
-static noinline bool slow_work_execute(int id)
-{
- struct slow_work *work = NULL;
- unsigned vsmax;
- bool very_slow;
-
- vsmax = slow_work_calc_vsmax();
-
- /* see if we can schedule a new thread to be started if we're not
- * keeping up with the work */
- if (!waitqueue_active(&slow_work_thread_wq) &&
- (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
- atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
- !slow_work_may_not_start_new_thread)
- slow_work_enqueue(&slow_work_new_thread);
-
- /* find something to execute */
- spin_lock_irq(&slow_work_queue_lock);
- if (!list_empty(&vslow_work_queue) &&
- atomic_read(&vslow_work_executing_count) < vsmax) {
- work = list_entry(vslow_work_queue.next,
- struct slow_work, link);
- if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
- BUG();
- list_del_init(&work->link);
- atomic_inc(&vslow_work_executing_count);
- very_slow = true;
- } else if (!list_empty(&slow_work_queue)) {
- work = list_entry(slow_work_queue.next,
- struct slow_work, link);
- if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
- BUG();
- list_del_init(&work->link);
- very_slow = false;
- } else {
- very_slow = false; /* avoid the compiler warning */
- }
-
- slow_work_set_thread_processing(id, work);
- if (work) {
- slow_work_mark_time(work);
- slow_work_begin_exec(id, work);
- }
-
- spin_unlock_irq(&slow_work_queue_lock);
-
- if (!work)
- return false;
-
- if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
- BUG();
-
- /* don't execute if the work is in the process of being cancelled */
- if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
- work->ops->execute(work);
-
- if (very_slow)
- atomic_dec(&vslow_work_executing_count);
- clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
-
- /* wake up anyone waiting for this work to be complete */
- wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
-
- slow_work_end_exec(id, work);
-
- /* if someone tried to enqueue the item whilst we were executing it,
- * then it'll be left unenqueued to avoid multiple threads trying to
- * execute it simultaneously
- *
- * there is, however, a race between us testing the pending flag and
- * getting the spinlock, and between the enqueuer setting the pending
- * flag and getting the spinlock, so we use a deferral bit to tell us
- * if the enqueuer got there first
- */
- if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
- spin_lock_irq(&slow_work_queue_lock);
-
- if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
- test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
- goto auto_requeue;
-
- spin_unlock_irq(&slow_work_queue_lock);
- }
-
- /* sort out the race between module unloading and put_ref() */
- slow_work_put_ref(work);
- slow_work_done_thread_processing(id, work);
-
- return true;
-
-auto_requeue:
- /* we must complete the enqueue operation
- * - we transfer our ref on the item back to the appropriate queue
- * - don't wake another thread up as we're awake already
- */
- slow_work_mark_time(work);
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
- list_add_tail(&work->link, &vslow_work_queue);
- else
- list_add_tail(&work->link, &slow_work_queue);
- spin_unlock_irq(&slow_work_queue_lock);
- slow_work_clear_thread_processing(id);
- return true;
-}
-
-/**
- * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
- * work: The work item under execution that wants to sleep
- * _timeout: Scheduler sleep timeout
- *
- * Allow a requeueable work item to sleep on a slow-work processor thread until
- * that thread is needed to do some other work or the sleep is interrupted by
- * some other event.
- *
- * The caller must set up a wake up event before calling this and must have set
- * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
- * condition before calling this function as no test is made here.
- *
- * False is returned if there is nothing on the queue; true is returned if the
- * work item should be requeued
- */
-bool slow_work_sleep_till_thread_needed(struct slow_work *work,
- signed long *_timeout)
-{
- wait_queue_head_t *wfo_wq;
- struct list_head *queue;
-
- DEFINE_WAIT(wait);
-
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
- wfo_wq = &vslow_work_queue_waits_for_occupation;
- queue = &vslow_work_queue;
- } else {
- wfo_wq = &slow_work_queue_waits_for_occupation;
- queue = &slow_work_queue;
- }
-
- if (!list_empty(queue))
- return true;
-
- add_wait_queue_exclusive(wfo_wq, &wait);
- if (list_empty(queue))
- *_timeout = schedule_timeout(*_timeout);
- finish_wait(wfo_wq, &wait);
-
- return !list_empty(queue);
-}
-EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
-
-/**
- * slow_work_enqueue - Schedule a slow work item for processing
- * @work: The work item to queue
- *
- * Schedule a slow work item for processing. If the item is already undergoing
- * execution, this guarantees not to re-enter the execution routine until the
- * first execution finishes.
- *
- * The item is pinned by this function as it retains a reference to it, managed
- * through the item operations. The item is unpinned once it has been
- * executed.
- *
- * An item may hog the thread that is running it for a relatively large amount
- * of time, sufficient, for example, to perform several lookup, mkdir, create
- * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
- *
- * Conversely, if a number of items are awaiting processing, it may take some
- * time before any given item is given attention. The number of threads in the
- * pool may be increased to deal with demand, but only up to a limit.
- *
- * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
- * the very slow queue, from which only a portion of the threads will be
- * allowed to pick items to execute. This ensures that very slow items won't
- * overly block ones that are just ordinarily slow.
- *
- * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
- * attempted queued)
- */
-int slow_work_enqueue(struct slow_work *work)
-{
- wait_queue_head_t *wfo_wq;
- struct list_head *queue;
- unsigned long flags;
- int ret;
-
- if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
- return -ECANCELED;
-
- BUG_ON(slow_work_user_count <= 0);
- BUG_ON(!work);
- BUG_ON(!work->ops);
-
- /* when honouring an enqueue request, we only promise that we will run
- * the work function in the future; we do not promise to run it once
- * per enqueue request
- *
- * we use the PENDING bit to merge together repeat requests without
- * having to disable IRQs and take the spinlock, whilst still
- * maintaining our promise
- */
- if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
- wfo_wq = &vslow_work_queue_waits_for_occupation;
- queue = &vslow_work_queue;
- } else {
- wfo_wq = &slow_work_queue_waits_for_occupation;
- queue = &slow_work_queue;
- }
-
- spin_lock_irqsave(&slow_work_queue_lock, flags);
-
- if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
- goto cancelled;
-
- /* we promise that we will not attempt to execute the work
- * function in more than one thread simultaneously
- *
- * this, however, leaves us with a problem if we're asked to
- * enqueue the work whilst someone is executing the work
- * function as simply queueing the work immediately means that
- * another thread may try executing it whilst it is already
- * under execution
- *
- * to deal with this, we set the ENQ_DEFERRED bit instead of
- * enqueueing, and the thread currently executing the work
- * function will enqueue the work item when the work function
- * returns and it has cleared the EXECUTING bit
- */
- if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
- set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
- } else {
- ret = slow_work_get_ref(work);
- if (ret < 0)
- goto failed;
- slow_work_mark_time(work);
- list_add_tail(&work->link, queue);
- wake_up(&slow_work_thread_wq);
-
- /* if someone who could be requeued is sleeping on a
- * thread, then ask them to yield their thread */
- if (work->link.prev == queue)
- wake_up(wfo_wq);
- }
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- }
- return 0;
-
-cancelled:
- ret = -ECANCELED;
-failed:
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(slow_work_enqueue);
-
-static int slow_work_wait(void *word)
-{
- schedule();
- return 0;
-}
-
-/**
- * slow_work_cancel - Cancel a slow work item
- * @work: The work item to cancel
- *
- * This function will cancel a previously enqueued work item. If we cannot
- * cancel the work item, it is guarenteed to have run when this function
- * returns.
- */
-void slow_work_cancel(struct slow_work *work)
-{
- bool wait = true, put = false;
-
- set_bit(SLOW_WORK_CANCELLING, &work->flags);
- smp_mb();
-
- /* if the work item is a delayed work item with an active timer, we
- * need to wait for the timer to finish _before_ getting the spinlock,
- * lest we deadlock against the timer routine
- *
- * the timer routine will leave DELAYED set if it notices the
- * CANCELLING flag in time
- */
- if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
- struct delayed_slow_work *dwork =
- container_of(work, struct delayed_slow_work, work);
- del_timer_sync(&dwork->timer);
- }
-
- spin_lock_irq(&slow_work_queue_lock);
-
- if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
- /* the timer routine aborted or never happened, so we are left
- * holding the timer's reference on the item and should just
- * drop the pending flag and wait for any ongoing execution to
- * finish */
- struct delayed_slow_work *dwork =
- container_of(work, struct delayed_slow_work, work);
-
- BUG_ON(timer_pending(&dwork->timer));
- BUG_ON(!list_empty(&work->link));
-
- clear_bit(SLOW_WORK_DELAYED, &work->flags);
- put = true;
- clear_bit(SLOW_WORK_PENDING, &work->flags);
-
- } else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
- !list_empty(&work->link)) {
- /* the link in the pending queue holds a reference on the item
- * that we will need to release */
- list_del_init(&work->link);
- wait = false;
- put = true;
- clear_bit(SLOW_WORK_PENDING, &work->flags);
-
- } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
- /* the executor is holding our only reference on the item, so
- * we merely need to wait for it to finish executing */
- clear_bit(SLOW_WORK_PENDING, &work->flags);
- }
-
- spin_unlock_irq(&slow_work_queue_lock);
-
- /* the EXECUTING flag is set by the executor whilst the spinlock is set
- * and before the item is dequeued - so assuming the above doesn't
- * actually dequeue it, simply waiting for the EXECUTING flag to be
- * released here should be sufficient */
- if (wait)
- wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
- TASK_UNINTERRUPTIBLE);
-
- clear_bit(SLOW_WORK_CANCELLING, &work->flags);
- if (put)
- slow_work_put_ref(work);
-}
-EXPORT_SYMBOL(slow_work_cancel);
-
-/*
- * Handle expiry of the delay timer, indicating that a delayed slow work item
- * should now be queued if not cancelled
- */
-static void delayed_slow_work_timer(unsigned long data)
-{
- wait_queue_head_t *wfo_wq;
- struct list_head *queue;
- struct slow_work *work = (struct slow_work *) data;
- unsigned long flags;
- bool queued = false, put = false, first = false;
-
- if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
- wfo_wq = &vslow_work_queue_waits_for_occupation;
- queue = &vslow_work_queue;
- } else {
- wfo_wq = &slow_work_queue_waits_for_occupation;
- queue = &slow_work_queue;
- }
-
- spin_lock_irqsave(&slow_work_queue_lock, flags);
- if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
- clear_bit(SLOW_WORK_DELAYED, &work->flags);
-
- if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
- /* we discard the reference the timer was holding in
- * favour of the one the executor holds */
- set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
- put = true;
- } else {
- slow_work_mark_time(work);
- list_add_tail(&work->link, queue);
- queued = true;
- if (work->link.prev == queue)
- first = true;
- }
- }
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- if (put)
- slow_work_put_ref(work);
- if (first)
- wake_up(wfo_wq);
- if (queued)
- wake_up(&slow_work_thread_wq);
-}
-
-/**
- * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
- * @dwork: The delayed work item to queue
- * @delay: When to start executing the work, in jiffies from now
- *
- * This is similar to slow_work_enqueue(), but it adds a delay before the work
- * is actually queued for processing.
- *
- * The item can have delayed processing requested on it whilst it is being
- * executed. The delay will begin immediately, and if it expires before the
- * item finishes executing, the item will be placed back on the queue when it
- * has done executing.
- */
-int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
- unsigned long delay)
-{
- struct slow_work *work = &dwork->work;
- unsigned long flags;
- int ret;
-
- if (delay == 0)
- return slow_work_enqueue(&dwork->work);
-
- BUG_ON(slow_work_user_count <= 0);
- BUG_ON(!work);
- BUG_ON(!work->ops);
-
- if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
- return -ECANCELED;
-
- if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
- spin_lock_irqsave(&slow_work_queue_lock, flags);
-
- if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
- goto cancelled;
-
- /* the timer holds a reference whilst it is pending */
- ret = slow_work_get_ref(work);
- if (ret < 0)
- goto cant_get_ref;
-
- if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
- BUG();
- dwork->timer.expires = jiffies + delay;
- dwork->timer.data = (unsigned long) work;
- dwork->timer.function = delayed_slow_work_timer;
- add_timer(&dwork->timer);
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- }
-
- return 0;
-
-cancelled:
- ret = -ECANCELED;
-cant_get_ref:
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- return ret;
-}
-EXPORT_SYMBOL(delayed_slow_work_enqueue);
-
-/*
- * Schedule a cull of the thread pool at some time in the near future
- */
-static void slow_work_schedule_cull(void)
-{
- mod_timer(&slow_work_cull_timer,
- round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
-}
-
-/*
- * Worker thread culling algorithm
- */
-static bool slow_work_cull_thread(void)
-{
- unsigned long flags;
- bool do_cull = false;
-
- spin_lock_irqsave(&slow_work_queue_lock, flags);
-
- if (slow_work_cull) {
- slow_work_cull = false;
-
- if (list_empty(&slow_work_queue) &&
- list_empty(&vslow_work_queue) &&
- atomic_read(&slow_work_thread_count) >
- slow_work_min_threads) {
- slow_work_schedule_cull();
- do_cull = true;
- }
- }
-
- spin_unlock_irqrestore(&slow_work_queue_lock, flags);
- return do_cull;
-}
-
-/*
- * Determine if there is slow work available for dispatch
- */
-static inline bool slow_work_available(int vsmax)
-{
- return !list_empty(&slow_work_queue) ||
- (!list_empty(&vslow_work_queue) &&
- atomic_read(&vslow_work_executing_count) < vsmax);
-}
-
-/*
- * Worker thread dispatcher
- */
-static int slow_work_thread(void *_data)
-{
- int vsmax, id;
-
- DEFINE_WAIT(wait);
-
- set_freezable();
- set_user_nice(current, -5);
-
- /* allocate ourselves an ID */
- spin_lock_irq(&slow_work_queue_lock);
- id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
- BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
- __set_bit(id, slow_work_ids);
- slow_work_set_thread_pid(id, current->pid);
- spin_unlock_irq(&slow_work_queue_lock);
-
- sprintf(current->comm, "kslowd%03u", id);
-
- for (;;) {
- vsmax = vslow_work_proportion;
- vsmax *= atomic_read(&slow_work_thread_count);
- vsmax /= 100;
-
- prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
- TASK_INTERRUPTIBLE);
- if (!freezing(current) &&
- !slow_work_threads_should_exit &&
- !slow_work_available(vsmax) &&
- !slow_work_cull)
- schedule();
- finish_wait(&slow_work_thread_wq, &wait);
-
- try_to_freeze();
-
- vsmax = vslow_work_proportion;
- vsmax *= atomic_read(&slow_work_thread_count);
- vsmax /= 100;
-
- if (slow_work_available(vsmax) && slow_work_execute(id)) {
- cond_resched();
- if (list_empty(&slow_work_queue) &&
- list_empty(&vslow_work_queue) &&
- atomic_read(&slow_work_thread_count) >
- slow_work_min_threads)
- slow_work_schedule_cull();
- continue;
- }
-
- if (slow_work_threads_should_exit)
- break;
-
- if (slow_work_cull && slow_work_cull_thread())
- break;
- }
-
- spin_lock_irq(&slow_work_queue_lock);
- slow_work_set_thread_pid(id, 0);
- __clear_bit(id, slow_work_ids);
- spin_unlock_irq(&slow_work_queue_lock);
-
- if (atomic_dec_and_test(&slow_work_thread_count))
- complete_and_exit(&slow_work_last_thread_exited, 0);
- return 0;
-}
-
-/*
- * Handle thread cull timer expiration
- */
-static void slow_work_cull_timeout(unsigned long data)
-{
- slow_work_cull = true;
- wake_up(&slow_work_thread_wq);
-}
-
-/*
- * Start a new slow work thread
- */
-static void slow_work_new_thread_execute(struct slow_work *work)
-{
- struct task_struct *p;
-
- if (slow_work_threads_should_exit)
- return;
-
- if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
- return;
-
- if (!mutex_trylock(&slow_work_user_lock))
- return;
-
- slow_work_may_not_start_new_thread = true;
- atomic_inc(&slow_work_thread_count);
- p = kthread_run(slow_work_thread, NULL, "kslowd");
- if (IS_ERR(p)) {
- printk(KERN_DEBUG "Slow work thread pool: OOM\n");
- if (atomic_dec_and_test(&slow_work_thread_count))
- BUG(); /* we're running on a slow work thread... */
- mod_timer(&slow_work_oom_timer,
- round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
- } else {
- /* ratelimit the starting of new threads */
- mod_timer(&slow_work_oom_timer, jiffies + 1);
- }
-
- mutex_unlock(&slow_work_user_lock);
-}
-
-static const struct slow_work_ops slow_work_new_thread_ops = {
- .owner = THIS_MODULE,
- .execute = slow_work_new_thread_execute,
-#ifdef CONFIG_SLOW_WORK_DEBUG
- .desc = slow_work_new_thread_desc,
-#endif
-};
-
-/*
- * post-OOM new thread start suppression expiration
- */
-static void slow_work_oom_timeout(unsigned long data)
-{
- slow_work_may_not_start_new_thread = false;
-}
-
-#ifdef CONFIG_SYSCTL
-/*
- * Handle adjustment of the minimum number of threads
- */
-static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int n;
-
- if (ret == 0) {
- mutex_lock(&slow_work_user_lock);
- if (slow_work_user_count > 0) {
- /* see if we need to start or stop threads */
- n = atomic_read(&slow_work_thread_count) -
- slow_work_min_threads;
-
- if (n < 0 && !slow_work_may_not_start_new_thread)
- slow_work_enqueue(&slow_work_new_thread);
- else if (n > 0)
- slow_work_schedule_cull();
- }
- mutex_unlock(&slow_work_user_lock);
- }
-
- return ret;
-}
-
-/*
- * Handle adjustment of the maximum number of threads
- */
-static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- int n;
-
- if (ret == 0) {
- mutex_lock(&slow_work_user_lock);
- if (slow_work_user_count > 0) {
- /* see if we need to stop threads */
- n = slow_work_max_threads -
- atomic_read(&slow_work_thread_count);
-
- if (n < 0)
- slow_work_schedule_cull();
- }
- mutex_unlock(&slow_work_user_lock);
- }
-
- return ret;
-}
-#endif /* CONFIG_SYSCTL */
-
-/**
- * slow_work_register_user - Register a user of the facility
- * @module: The module about to make use of the facility
- *
- * Register a user of the facility, starting up the initial threads if there
- * aren't any other users at this point. This will return 0 if successful, or
- * an error if not.
- */
-int slow_work_register_user(struct module *module)
-{
- struct task_struct *p;
- int loop;
-
- mutex_lock(&slow_work_user_lock);
-
- if (slow_work_user_count == 0) {
- printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
- init_completion(&slow_work_last_thread_exited);
-
- slow_work_threads_should_exit = false;
- slow_work_init(&slow_work_new_thread,
- &slow_work_new_thread_ops);
- slow_work_may_not_start_new_thread = false;
- slow_work_cull = false;
-
- /* start the minimum number of threads */
- for (loop = 0; loop < slow_work_min_threads; loop++) {
- atomic_inc(&slow_work_thread_count);
- p = kthread_run(slow_work_thread, NULL, "kslowd");
- if (IS_ERR(p))
- goto error;
- }
- printk(KERN_NOTICE "Slow work thread pool: Ready\n");
- }
-
- slow_work_user_count++;
- mutex_unlock(&slow_work_user_lock);
- return 0;
-
-error:
- if (atomic_dec_and_test(&slow_work_thread_count))
- complete(&slow_work_last_thread_exited);
- if (loop > 0) {
- printk(KERN_ERR "Slow work thread pool:"
- " Aborting startup on ENOMEM\n");
- slow_work_threads_should_exit = true;
- wake_up_all(&slow_work_thread_wq);
- wait_for_completion(&slow_work_last_thread_exited);
- printk(KERN_ERR "Slow work thread pool: Aborted\n");
- }
- mutex_unlock(&slow_work_user_lock);
- return PTR_ERR(p);
-}
-EXPORT_SYMBOL(slow_work_register_user);
-
-/*
- * wait for all outstanding items from the calling module to complete
- * - note that more items may be queued whilst we're waiting
- */
-static void slow_work_wait_for_items(struct module *module)
-{
-#ifdef CONFIG_MODULES
- DECLARE_WAITQUEUE(myself, current);
- struct slow_work *work;
- int loop;
-
- mutex_lock(&slow_work_unreg_sync_lock);
- add_wait_queue(&slow_work_unreg_wq, &myself);
-
- for (;;) {
- spin_lock_irq(&slow_work_queue_lock);
-
- /* first of all, we wait for the last queued item in each list
- * to be processed */
- list_for_each_entry_reverse(work, &vslow_work_queue, link) {
- if (work->owner == module) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- slow_work_unreg_work_item = work;
- goto do_wait;
- }
- }
- list_for_each_entry_reverse(work, &slow_work_queue, link) {
- if (work->owner == module) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- slow_work_unreg_work_item = work;
- goto do_wait;
- }
- }
-
- /* then we wait for the items being processed to finish */
- slow_work_unreg_module = module;
- smp_mb();
- for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
- if (slow_work_thread_processing[loop] == module)
- goto do_wait;
- }
- spin_unlock_irq(&slow_work_queue_lock);
- break; /* okay, we're done */
-
- do_wait:
- spin_unlock_irq(&slow_work_queue_lock);
- schedule();
- slow_work_unreg_work_item = NULL;
- slow_work_unreg_module = NULL;
- }
-
- remove_wait_queue(&slow_work_unreg_wq, &myself);
- mutex_unlock(&slow_work_unreg_sync_lock);
-#endif /* CONFIG_MODULES */
-}
-
-/**
- * slow_work_unregister_user - Unregister a user of the facility
- * @module: The module whose items should be cleared
- *
- * Unregister a user of the facility, killing all the threads if this was the
- * last one.
- *
- * This waits for all the work items belonging to the nominated module to go
- * away before proceeding.
- */
-void slow_work_unregister_user(struct module *module)
-{
- /* first of all, wait for all outstanding items from the calling module
- * to complete */
- if (module)
- slow_work_wait_for_items(module);
-
- /* then we can actually go about shutting down the facility if need
- * be */
- mutex_lock(&slow_work_user_lock);
-
- BUG_ON(slow_work_user_count <= 0);
-
- slow_work_user_count--;
- if (slow_work_user_count == 0) {
- printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
- slow_work_threads_should_exit = true;
- del_timer_sync(&slow_work_cull_timer);
- del_timer_sync(&slow_work_oom_timer);
- wake_up_all(&slow_work_thread_wq);
- wait_for_completion(&slow_work_last_thread_exited);
- printk(KERN_NOTICE "Slow work thread pool:"
- " Shut down complete\n");
- }
-
- mutex_unlock(&slow_work_user_lock);
-}
-EXPORT_SYMBOL(slow_work_unregister_user);
-
-/*
- * Initialise the slow work facility
- */
-static int __init init_slow_work(void)
-{
- unsigned nr_cpus = num_possible_cpus();
-
- if (slow_work_max_threads < nr_cpus)
- slow_work_max_threads = nr_cpus;
-#ifdef CONFIG_SYSCTL
- if (slow_work_max_max_threads < nr_cpus * 2)
- slow_work_max_max_threads = nr_cpus * 2;
-#endif
-#ifdef CONFIG_SLOW_WORK_DEBUG
- {
- struct dentry *dbdir;
-
- dbdir = debugfs_create_dir("slow_work", NULL);
- if (dbdir && !IS_ERR(dbdir))
- debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
- NULL, &slow_work_runqueue_fops);
- }
-#endif
- return 0;
-}
-
-subsys_initcall(init_slow_work);
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
deleted file mode 100644
index a29ebd1ef41d..000000000000
--- a/kernel/slow-work.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Slow work private definitions
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
- * things to do */
-#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
- * OOM */
-
-#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */
-
-/*
- * slow-work.c
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-extern struct slow_work *slow_work_execs[];
-extern pid_t slow_work_pids[];
-extern rwlock_t slow_work_execs_lock;
-#endif
-
-extern struct list_head slow_work_queue;
-extern struct list_head vslow_work_queue;
-extern spinlock_t slow_work_queue_lock;
-
-/*
- * slow-work-debugfs.c
- */
-#ifdef CONFIG_SLOW_WORK_DEBUG
-extern const struct file_operations slow_work_runqueue_fops;
-
-extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
-#endif
-
-/*
- * Helper functions
- */
-static inline void slow_work_set_thread_pid(int id, pid_t pid)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- slow_work_pids[id] = pid;
-#endif
-}
-
-static inline void slow_work_mark_time(struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- work->mark = CURRENT_TIME;
-#endif
-}
-
-static inline void slow_work_begin_exec(int id, struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- slow_work_execs[id] = work;
-#endif
-}
-
-static inline void slow_work_end_exec(int id, struct slow_work *work)
-{
-#ifdef CONFIG_SLOW_WORK_DEBUG
- write_lock(&slow_work_execs_lock);
- slow_work_execs[id] = NULL;
- write_unlock(&slow_work_execs_lock);
-#endif
-}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..4372ccb25127 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,9 +35,9 @@ struct cpu_stop_done {
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
spinlock_t lock;
+ bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
struct task_struct *thread; /* stopper thread */
- bool enabled; /* is this stopper enabled? */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..e9ad44489828 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1236,15 +1236,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
- else {
- struct rlimit value;
- task_lock(current->group_leader);
- value = current->signal->rlim[resource];
- task_unlock(current->group_leader);
- return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
- }
+ struct rlimit value;
+ int ret;
+
+ ret = do_prlimit(current, resource, NULL, &value);
+ if (!ret)
+ ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0;
+
+ return ret;
}
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
@@ -1272,44 +1271,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
#endif
-SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+static inline bool rlim64_is_infinity(__u64 rlim64)
{
- struct rlimit new_rlim, *old_rlim;
- int retval;
+#if BITS_PER_LONG < 64
+ return rlim64 >= ULONG_MAX;
+#else
+ return rlim64 == RLIM64_INFINITY;
+#endif
+}
+
+static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64)
+{
+ if (rlim->rlim_cur == RLIM_INFINITY)
+ rlim64->rlim_cur = RLIM64_INFINITY;
+ else
+ rlim64->rlim_cur = rlim->rlim_cur;
+ if (rlim->rlim_max == RLIM_INFINITY)
+ rlim64->rlim_max = RLIM64_INFINITY;
+ else
+ rlim64->rlim_max = rlim->rlim_max;
+}
+
+static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
+{
+ if (rlim64_is_infinity(rlim64->rlim_cur))
+ rlim->rlim_cur = RLIM_INFINITY;
+ else
+ rlim->rlim_cur = (unsigned long)rlim64->rlim_cur;
+ if (rlim64_is_infinity(rlim64->rlim_max))
+ rlim->rlim_max = RLIM_INFINITY;
+ else
+ rlim->rlim_max = (unsigned long)rlim64->rlim_max;
+}
+
+/* make sure you are allowed to change @tsk limits before calling this */
+int do_prlimit(struct task_struct *tsk, unsigned int resource,
+ struct rlimit *new_rlim, struct rlimit *old_rlim)
+{
+ struct rlimit *rlim;
+ int retval = 0;
if (resource >= RLIM_NLIMITS)
return -EINVAL;
- if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
- return -EFAULT;
- if (new_rlim.rlim_cur > new_rlim.rlim_max)
- return -EINVAL;
- old_rlim = current->signal->rlim + resource;
- if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
- !capable(CAP_SYS_RESOURCE))
- return -EPERM;
- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
- return -EPERM;
-
- retval = security_task_setrlimit(resource, &new_rlim);
- if (retval)
- return retval;
-
- if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- new_rlim.rlim_cur = 1;
+ if (new_rlim) {
+ if (new_rlim->rlim_cur > new_rlim->rlim_max)
+ return -EINVAL;
+ if (resource == RLIMIT_NOFILE &&
+ new_rlim->rlim_max > sysctl_nr_open)
+ return -EPERM;
}
- task_lock(current->group_leader);
- *old_rlim = new_rlim;
- task_unlock(current->group_leader);
-
- if (resource != RLIMIT_CPU)
+ /* protect tsk->signal and tsk->sighand from disappearing */
+ read_lock(&tasklist_lock);
+ if (!tsk->sighand) {
+ retval = -ESRCH;
goto out;
+ }
+
+ rlim = tsk->signal->rlim + resource;
+ task_lock(tsk->group_leader);
+ if (new_rlim) {
+ if (new_rlim->rlim_max > rlim->rlim_max &&
+ !capable(CAP_SYS_RESOURCE))
+ retval = -EPERM;
+ if (!retval)
+ retval = security_task_setrlimit(tsk->group_leader,
+ resource, new_rlim);
+ if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim->rlim_cur = 1;
+ }
+ }
+ if (!retval) {
+ if (old_rlim)
+ *old_rlim = *rlim;
+ if (new_rlim)
+ *rlim = *new_rlim;
+ }
+ task_unlock(tsk->group_leader);
/*
* RLIMIT_CPU handling. Note that the kernel fails to return an error
@@ -1317,14 +1361,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
* very long-standing error, and fixing it now risks breakage of
* applications, so we live with it
*/
- if (new_rlim.rlim_cur == RLIM_INFINITY)
- goto out;
-
- update_rlimit_cpu(new_rlim.rlim_cur);
+ if (!retval && new_rlim && resource == RLIMIT_CPU &&
+ new_rlim->rlim_cur != RLIM_INFINITY)
+ update_rlimit_cpu(tsk, new_rlim->rlim_cur);
out:
+ read_unlock(&tasklist_lock);
+ return retval;
+}
+
+/* rcu lock must be held */
+static int check_prlimit_permission(struct task_struct *task)
+{
+ const struct cred *cred = current_cred(), *tcred;
+
+ tcred = __task_cred(task);
+ if ((cred->uid != tcred->euid ||
+ cred->uid != tcred->suid ||
+ cred->uid != tcred->uid ||
+ cred->gid != tcred->egid ||
+ cred->gid != tcred->sgid ||
+ cred->gid != tcred->gid) &&
+ !capable(CAP_SYS_RESOURCE)) {
+ return -EPERM;
+ }
+
return 0;
}
+SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
+ const struct rlimit64 __user *, new_rlim,
+ struct rlimit64 __user *, old_rlim)
+{
+ struct rlimit64 old64, new64;
+ struct rlimit old, new;
+ struct task_struct *tsk;
+ int ret;
+
+ if (new_rlim) {
+ if (copy_from_user(&new64, new_rlim, sizeof(new64)))
+ return -EFAULT;
+ rlim64_to_rlim(&new64, &new);
+ }
+
+ rcu_read_lock();
+ tsk = pid ? find_task_by_vpid(pid) : current;
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ ret = check_prlimit_permission(tsk);
+ if (ret) {
+ rcu_read_unlock();
+ return ret;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+ old_rlim ? &old : NULL);
+
+ if (!ret && old_rlim) {
+ rlim_to_rlim64(&old, &old64);
+ if (copy_to_user(old_rlim, &old64, sizeof(old64)))
+ ret = -EFAULT;
+ }
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
+{
+ struct rlimit new_rlim;
+
+ if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
+ return -EFAULT;
+ return do_prlimit(current, resource, &new_rlim, NULL);
+}
+
/*
* It would make sense to put struct rusage in the task_struct,
* except that would make the task_struct be *really big*. After
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 70f2ea758ffe..bad369ec5403 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2);
/* performance counters: */
cond_syscall(sys_perf_event_open);
+
+/* fanotify! */
+cond_syscall(sys_fanotify_init);
+cond_syscall(sys_fanotify_mark);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6f79c7f81c96..ca38e8e3e907 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,16 +44,17 @@
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/dcache.h>
+#include <linux/dnotify.h>
#include <linux/syscalls.h>
#include <linux/vmstat.h>
#include <linux/nfs_fs.h>
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
-#include <linux/slow-work.h>
#include <linux/perf_event.h>
#include <linux/kprobes.h>
#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -86,9 +87,6 @@
/* External variables not in a header file. */
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
-extern int sysctl_panic_on_oom;
-extern int sysctl_oom_kill_allocating_task;
-extern int sysctl_oom_dump_tasks;
extern int max_threads;
extern int core_uses_pid;
extern int suid_dumpable;
@@ -134,6 +132,9 @@ static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
+#ifdef CONFIG_INOTIFY_USER
+#include <linux/inotify.h>
+#endif
#ifdef CONFIG_SPARC
#include <asm/system.h>
#endif
@@ -210,9 +211,6 @@ static struct ctl_table fs_table[];
static struct ctl_table debug_table[];
static struct ctl_table dev_table[];
extern struct ctl_table random_table[];
-#ifdef CONFIG_INOTIFY_USER
-extern struct ctl_table inotify_table[];
-#endif
#ifdef CONFIG_EPOLL
extern struct ctl_table epoll_table[];
#endif
@@ -566,7 +564,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
-#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+#ifdef CONFIG_HOTPLUG
{
.procname = "hotplug",
.data = &uevent_helper,
@@ -917,13 +915,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_SLOW_WORK
- {
- .procname = "slow-work",
- .mode = 0555,
- .child = slow_work_sysctls,
- },
-#endif
#ifdef CONFIG_PERF_EVENTS
{
.procname = "perf_event_paranoid",
diff --git a/kernel/time.c b/kernel/time.c
index 848b1c2ab09a..ba9b338d1835 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
}
EXPORT_SYMBOL(timespec_trunc);
-#ifndef CONFIG_GENERIC_TIME
-/*
- * Simulate gettimeofday using do_gettimeofday which only allows a timeval
- * and therefore only yields usec accuracy
- */
-void getnstimeofday(struct timespec *tv)
-{
- struct timeval x;
-
- do_gettimeofday(&x);
- tv->tv_sec = x.tv_sec;
- tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
-}
-EXPORT_SYMBOL_GPL(getnstimeofday);
-#endif
-
/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 95ed42951e0a..f06a8a365648 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -6,7 +6,7 @@ config TICK_ONESHOT
config NO_HZ
bool "Tickless System (Dynamic Ticks)"
- depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
This option enables a tickless system: timer interrupts will
@@ -15,7 +15,7 @@ config NO_HZ
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
- depends on GENERIC_TIME && GENERIC_CLOCKEVENTS
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
help
This option enables high resolution timer support. If your
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..c18d7efa1b4b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
return max_nsecs - (max_nsecs >> 5);
}
-#ifdef CONFIG_GENERIC_TIME
+#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
/**
* clocksource_select - Select the best clocksource available
@@ -577,7 +577,7 @@ static void clocksource_select(void)
}
}
-#else /* CONFIG_GENERIC_TIME */
+#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
static inline void clocksource_select(void) { }
@@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs)
#define MAX_UPDATE_LENGTH 5 /* Seconds */
/**
- * __clocksource_register_scale - Used to install new clocksources
+ * __clocksource_updatefreq_scale - Used update clocksource with new freq
* @t: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
*
- * Returns -EBUSY if registration fails, zero otherwise.
+ * This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
- * clocksource_register_hz() or clocksource_register_khz helper functions.
+ * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
*/
-int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
-
/*
* Ideally we want to use some of the limits used in
* clocksource_max_deferment, to provide a more informed
@@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
NSEC_PER_SEC/scale,
MAX_UPDATE_LENGTH*scale);
cs->max_idle_ns = clocksource_max_deferment(cs);
+}
+EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+
+/**
+ * __clocksource_register_scale - Used to install new clocksources
+ * @t: clocksource to be registered
+ * @scale: Scale factor multiplied against freq to get clocksource hz
+ * @freq: clocksource frequency (cycles per second) divided by scale
+ *
+ * Returns -EBUSY if registration fails, zero otherwise.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_register_hz() or clocksource_register_khz helper functions.
+ */
+int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+
+ /* Intialize mult/shift and max_idle_ns */
+ __clocksource_updatefreq_scale(cs, scale, freq);
+ /* Add clocksource to the clcoksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
clocksource_select();
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
/*
* Setup the next period for devices, which do not have
* periodic mode. We read dev->next_event first and add to it
- * when the event alrady expired. clockevents_program_event()
+ * when the event already expired. clockevents_program_event()
* sets dev->next_event only when the event is really
* programmed to the device.
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..3e216e01bbd1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
} while (read_seqretry(&xtime_lock, seq));
if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
- arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
+ arch_needs_cpu(cpu)) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- if (select_nohz_load_balancer(1)) {
- /*
- * sched tick not stopped!
- */
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
- goto out;
- }
+ select_nohz_load_balancer(1);
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -780,7 +774,6 @@ void tick_setup_sched_timer(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now = ktime_get();
- u64 offset;
/*
* Emulate tick processing via per-CPU hrtimers:
@@ -790,10 +783,6 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
- offset = ktime_to_ns(tick_period) >> 1;
- do_div(offset, num_possible_cpus());
- offset *= smp_processor_id();
- hrtimer_add_expires_ns(&ts->sched_timer, offset);
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index caf8d4d4f5c8..49010d822f72 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
* - wall_to_monotonic is no longer the boot time, getboottime must be
* used instead.
*/
-struct timespec xtime __attribute__ ((aligned (16)));
-struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+static struct timespec xtime __attribute__ ((aligned (16)));
+static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static struct timespec total_sleep_time;
/*
@@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond)
{
xtime.tv_sec += leapsecond;
wall_to_monotonic.tv_sec -= leapsecond;
- update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
}
-#ifdef CONFIG_GENERIC_TIME
-
/**
* timekeeping_forward_now - update clock to the current time
*
@@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv)
timekeeper.ntp_error = 0;
ntp_clear();
- update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock)
tick_clock_notify();
}
-#else /* GENERIC_TIME */
-
-static inline void timekeeping_forward_now(void) { }
-
-/**
- * ktime_get - get the monotonic time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get(void)
-{
- struct timespec now;
-
- ktime_get_ts(&now);
-
- return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get);
-
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts: pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
-{
- struct timespec tomono;
- unsigned long seq;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- getnstimeofday(ts);
- tomono = wall_to_monotonic;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
-
-#endif /* !GENERIC_TIME */
-
/**
* ktime_get_real - get the real (wall-) time in ktime_t format
*
@@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev)
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
ts = timespec_sub(ts, timekeeping_suspend_time);
- xtime = timespec_add_safe(xtime, ts);
+ xtime = timespec_add(xtime, ts);
wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
- total_sleep_time = timespec_add_safe(total_sleep_time, ts);
+ total_sleep_time = timespec_add(total_sleep_time, ts);
}
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset)
static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
{
u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+ u64 raw_nsecs;
/* If the offset is smaller then a shifted interval, do nothing */
if (offset < timekeeper.cycle_interval<<shift)
@@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
second_overflow();
}
- /* Accumulate into raw time */
- raw_time.tv_nsec += timekeeper.raw_interval << shift;;
- while (raw_time.tv_nsec >= NSEC_PER_SEC) {
- raw_time.tv_nsec -= NSEC_PER_SEC;
- raw_time.tv_sec++;
+ /* Accumulate raw time */
+ raw_nsecs = timekeeper.raw_interval << shift;
+ raw_nsecs += raw_time.tv_nsec;
+ if (raw_nsecs >= NSEC_PER_SEC) {
+ u64 raw_secs = raw_nsecs;
+ raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
+ raw_time.tv_sec += raw_secs;
}
+ raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
timekeeper.ntp_error += tick_length << shift;
@@ -784,10 +742,11 @@ void update_wall_time(void)
return;
clock = timekeeper.clock;
-#ifdef CONFIG_GENERIC_TIME
- offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
-#else
+
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = timekeeper.cycle_interval;
+#else
+ offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
@@ -856,7 +815,8 @@ void update_wall_time(void)
}
/* check to see if there is a new clocksource to use */
- update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
}
/**
@@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime);
*/
void monotonic_to_bootbased(struct timespec *ts)
{
- *ts = timespec_add_safe(*ts, total_sleep_time);
+ *ts = timespec_add(*ts, total_sleep_time);
}
EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
@@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void)
return xtime;
}
+struct timespec __get_wall_to_monotonic(void)
+{
+ return wall_to_monotonic;
+}
+
struct timespec current_kernel_time(void)
{
struct timespec now;
diff --git a/kernel/timer.c b/kernel/timer.c
index c29e2d4d2a66..97bf05baade7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/*
* Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB for
- * the new flag to indicate whether the timer is deferrable
+ * base in timer_list is guaranteed to be zero. Use the LSB to
+ * indicate whether the timer is deferrable.
+ *
+ * A deferrable timer will work normally when the system is busy, but
+ * will not cause a CPU to come out of idle just to service it; instead,
+ * the timer will be serviced when the CPU eventually wakes up with a
+ * subsequent non-deferrable timer.
*/
#define TBASE_DEFERRABLE_FLAG (0x1)
@@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
/**
* set_timer_slack - set the allowed slack for a timer
+ * @timer: the timer to be modified
* @slack_hz: the amount of time (in jiffies) allowed for rounding
*
* Set the amount of time, in jiffies, that a certain timer has
@@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer,
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
+void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
+ const char *name,
+ struct lock_class_key *key,
+ void (*function)(unsigned long),
+ unsigned long data)
+{
+ timer->function = function;
+ timer->data = data;
+ init_timer_on_stack_key(timer, name, key);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
+
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
@@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
- if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
- int preferred_cpu = get_nohz_load_balancer();
-
- if (preferred_cpu >= 0)
- cpu = preferred_cpu;
- }
+ if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
+ cpu = get_nohz_timer_target();
#endif
new_base = per_cpu(tvec_bases, cpu);
@@ -1749,3 +1764,25 @@ unsigned long msleep_interruptible(unsigned int msecs)
}
EXPORT_SYMBOL(msleep_interruptible);
+
+static int __sched do_usleep_range(unsigned long min, unsigned long max)
+{
+ ktime_t kmin;
+ unsigned long delta;
+
+ kmin = ktime_set(0, min * NSEC_PER_USEC);
+ delta = (max - min) * NSEC_PER_USEC;
+ return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+}
+
+/**
+ * usleep_range - Drop in replacement for udelay where wakeup is flexible
+ * @min: Minimum time in usecs to sleep
+ * @max: Maximum time in usecs to sleep
+ */
+void usleep_range(unsigned long min, unsigned long max)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ do_usleep_range(min, max);
+}
+EXPORT_SYMBOL(usleep_range);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index c7683fd8a03a..538501c6ea50 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -153,7 +153,7 @@ config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
- depends on GENERIC_TIME
+ depends on !ARCH_USES_GETTIMEOFFSET
select TRACE_IRQFLAGS
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -175,7 +175,7 @@ config IRQSOFF_TRACER
config PREEMPT_TRACER
bool "Preemption-off Latency Tracer"
default n
- depends on GENERIC_TIME
+ depends on !ARCH_USES_GETTIMEOFFSET
depends on PREEMPT
select GENERIC_TRACER
select TRACER_MAX_TRACE
@@ -323,17 +323,6 @@ config STACK_TRACER
Say N if unsure.
-config WORKQUEUE_TRACER
- bool "Trace workqueues"
- select GENERIC_TRACER
- help
- The workqueue tracer provides some statistical information
- about each cpu workqueue thread such as the number of the
- works inserted and executed since their creation. It can help
- to evaluate the amount of work each of them has to perform.
- For example it can help a developer to decide whether he should
- choose a per-cpu workqueue instead of a singlethreaded one.
-
config BLK_DEV_IO_TRACE
bool "Support for tracing block IO actions"
depends on SYSFS
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 438e84a56ab3..53f338190b26 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,5 +53,8 @@ endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+ifeq ($(CONFIG_TRACING),y)
+obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
+endif
libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 638711c17504..959f8d6c8cc1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
+#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
+#define BLK_TC_RAHEAD BLK_TC_AHEAD
+
/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
- (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
+ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
/*
* The worker for the various blk_add_trace*() types. Fills out a
@@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
return;
what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, BARRIER);
- what |= MASK_TC_BIT(rw, SYNCIO);
- what |= MASK_TC_BIT(rw, AHEAD);
+ what |= MASK_TC_BIT(rw, HARDBARRIER);
+ what |= MASK_TC_BIT(rw, SYNC);
+ what |= MASK_TC_BIT(rw, RAHEAD);
what |= MASK_TC_BIT(rw, META);
what |= MASK_TC_BIT(rw, DISCARD);
@@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
EXPORT_SYMBOL_GPL(blk_trace_setup);
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+static int compat_blk_trace_setup(struct request_queue *q, char *name,
+ dev_t dev, struct block_device *bdev,
+ char __user *arg)
+{
+ struct blk_user_trace_setup buts;
+ struct compat_blk_user_trace_setup cbuts;
+ int ret;
+
+ if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
+ return -EFAULT;
+
+ buts = (struct blk_user_trace_setup) {
+ .act_mask = cbuts.act_mask,
+ .buf_size = cbuts.buf_size,
+ .buf_nr = cbuts.buf_nr,
+ .start_lba = cbuts.start_lba,
+ .end_lba = cbuts.end_lba,
+ .pid = cbuts.pid,
+ };
+ memcpy(&buts.name, &cbuts.name, 32);
+
+ ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, &buts.name, 32)) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+#endif
+
int blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
@@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
+ lock_kernel();
mutex_lock(&bdev->bd_mutex);
switch (cmd) {
@@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
bdevname(bdev, b);
ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
+#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
+ case BLKTRACESETUP32:
+ bdevname(bdev, b);
+ ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ break;
+#endif
case BLKTRACESTART:
start = 1;
case BLKTRACESTOP:
@@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
}
mutex_unlock(&bdev->bd_mutex);
+ unlock_kernel();
return ret;
}
@@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (likely(!bt))
return;
- if (blk_discard_rq(rq))
- rw |= (1 << BIO_RW_DISCARD);
+ if (rq->cmd_flags & REQ_DISCARD)
+ rw |= REQ_DISCARD;
+
+ if (rq->cmd_flags & REQ_SECURE)
+ rw |= REQ_SECURE;
- if (blk_pc_request(rq)) {
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
what, rq->errors, rq->cmd_len, rq->cmd);
@@ -925,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q,
if (likely(!bt))
return;
- if (blk_pc_request(rq))
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
else
@@ -1730,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq)
int len = rq->cmd_len;
unsigned char *cmd = rq->cmd;
- if (!blk_pc_request(rq)) {
+ if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
buf[0] = '\0';
return;
}
@@ -1755,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
if (rw & WRITE)
rwbs[i++] = 'W';
- else if (rw & 1 << BIO_RW_DISCARD)
+ else if (rw & REQ_DISCARD)
rwbs[i++] = 'D';
else if (bytes)
rwbs[i++] = 'R';
else
rwbs[i++] = 'N';
- if (rw & 1 << BIO_RW_AHEAD)
+ if (rw & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (rw & 1 << BIO_RW_BARRIER)
+ if (rw & REQ_HARDBARRIER)
rwbs[i++] = 'B';
- if (rw & 1 << BIO_RW_SYNCIO)
+ if (rw & REQ_SYNC)
rwbs[i++] = 'S';
- if (rw & 1 << BIO_RW_META)
+ if (rw & REQ_META)
rwbs[i++] = 'M';
+ if (rw & REQ_SECURE)
+ rwbs[i++] = 'E';
rwbs[i] = '\0';
}
@@ -1779,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
int rw = rq->cmd_flags & 0x03;
int bytes;
- if (blk_discard_rq(rq))
- rw |= (1 << BIO_RW_DISCARD);
+ if (rq->cmd_flags & REQ_DISCARD)
+ rw |= REQ_DISCARD;
+
+ if (rq->cmd_flags & REQ_SECURE)
+ rw |= REQ_SECURE;
bytes = blk_rq_bytes(rq);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3632ce87674f..19cccc3c3028 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3846,6 +3846,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
rpos = reader->read;
pos += size;
+ if (rpos >= commit)
+ break;
+
event = rb_reader_event(cpu_buffer);
size = rb_event_length(event);
} while (len > size);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4b1122d0df37..9ec59f541156 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void)
preempt_enable();
}
-static cpumask_var_t __read_mostly tracing_buffer_mask;
-
-#define for_each_tracing_cpu(cpu) \
- for_each_cpu(cpu, tracing_buffer_mask)
+cpumask_var_t __read_mostly tracing_buffer_mask;
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -744,13 +741,6 @@ __acquires(kernel_lock)
return -1;
}
- /*
- * When this gets called we hold the BKL which means that
- * preemption is disabled. Various trace selftests however
- * need to disable and enable preemption for successful tests.
- * So we drop the BKL here and grab it after the tests again.
- */
- unlock_kernel();
mutex_lock(&trace_types_lock);
tracing_selftest_running = true;
@@ -832,7 +822,6 @@ __acquires(kernel_lock)
#endif
out_unlock:
- lock_kernel();
return ret;
}
@@ -1493,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
}
EXPORT_SYMBOL_GPL(trace_vprintk);
-enum trace_file_type {
- TRACE_FILE_LAT_FMT = 1,
- TRACE_FILE_ANNOTATE = 2,
-};
-
static void trace_iterator_increment(struct trace_iterator *iter)
{
/* Don't allow ftrace to trace into the ring buffers */
@@ -1595,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
}
/* Find the next real entry, and increment the iterator to the next entry */
-static void *find_next_entry_inc(struct trace_iterator *iter)
+void *trace_find_next_entry_inc(struct trace_iterator *iter)
{
iter->ent = __find_next_entry(iter, &iter->cpu,
&iter->lost_events, &iter->ts);
@@ -1630,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
return NULL;
if (iter->idx < 0)
- ent = find_next_entry_inc(iter);
+ ent = trace_find_next_entry_inc(iter);
else
ent = iter;
while (ent && iter->idx < i)
- ent = find_next_entry_inc(iter);
+ ent = trace_find_next_entry_inc(iter);
iter->pos = *pos;
return ent;
}
-static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
+void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
struct trace_array *tr = iter->tr;
struct ring_buffer_event *event;
@@ -2003,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter)
}
/* Called with trace_event_read_lock() held. */
-static enum print_line_t print_trace_line(struct trace_iterator *iter)
+enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
@@ -3193,7 +3177,7 @@ waitagain:
trace_event_read_lock();
trace_access_lock(iter->cpu_file);
- while (find_next_entry_inc(iter) != NULL) {
+ while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
int len = iter->seq.len;
@@ -3276,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(iter);
rem -= count;
- if (!find_next_entry_inc(iter)) {
+ if (!trace_find_next_entry_inc(iter)) {
rem = 0;
iter->ent = NULL;
break;
@@ -3332,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
if (ret <= 0)
goto out_err;
- if (!iter->ent && !find_next_entry_inc(iter)) {
+ if (!iter->ent && !trace_find_next_entry_inc(iter)) {
ret = -EFAULT;
goto out_err;
}
@@ -3479,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
char *buf;
+ size_t written;
if (tracing_disabled)
return -EINVAL;
@@ -3500,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
} else
buf[cnt] = '\0';
- cnt = mark_printk("%s", buf);
+ written = mark_printk("%s", buf);
kfree(buf);
- *fpos += cnt;
+ *fpos += written;
- return cnt;
+ /* don't tell userspace we wrote more - it might confuse them */
+ if (written > cnt)
+ written = cnt;
+
+ return written;
}
static int tracing_clock_show(struct seq_file *m, void *v)
@@ -4402,7 +4391,7 @@ static struct notifier_block trace_die_notifier = {
*/
#define KERN_TRACE KERN_EMERG
-static void
+void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
@@ -4417,6 +4406,13 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
+void trace_init_global_iter(struct trace_iterator *iter)
+{
+ iter->tr = &global_trace;
+ iter->trace = current_trace;
+ iter->cpu_file = TRACE_PIPE_ALL_CPU;
+}
+
static void
__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
{
@@ -4442,8 +4438,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
if (disable_tracing)
ftrace_kill();
+ trace_init_global_iter(&iter);
+
for_each_tracing_cpu(cpu) {
- atomic_inc(&global_trace.data[cpu]->disabled);
+ atomic_inc(&iter.tr->data[cpu]->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -4492,7 +4490,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
iter.iter_flags |= TRACE_FILE_LAT_FMT;
iter.pos = -1;
- if (find_next_entry_inc(&iter) != NULL) {
+ if (trace_find_next_entry_inc(&iter) != NULL) {
int ret;
ret = print_trace_line(&iter);
@@ -4514,7 +4512,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&global_trace.data[cpu]->disabled);
+ atomic_dec(&iter.tr->data[cpu]->disabled);
}
tracing_on();
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d05c873dd4b2..d39b3c5454a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -314,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
+int trace_empty(struct trace_iterator *iter);
+
+void *trace_find_next_entry_inc(struct trace_iterator *iter);
+
+void trace_init_global_iter(struct trace_iterator *iter);
+
+void tracing_iter_reset(struct trace_iterator *iter, int cpu);
+
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
@@ -351,6 +359,15 @@ void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type);
int is_tracing_stopped(void);
+enum trace_file_type {
+ TRACE_FILE_LAT_FMT = 1,
+ TRACE_FILE_ANNOTATE = 2,
+};
+
+extern cpumask_var_t __read_mostly tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu) \
+ for_each_cpu(cpu, tracing_buffer_mask)
extern unsigned long nsecs_to_usecs(unsigned long nsecs);
@@ -436,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...);
+void trace_printk_seq(struct trace_seq *s);
+enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 52fda6c04ac3..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -55,7 +55,7 @@ u64 notrace trace_clock_local(void)
*/
u64 notrace trace_clock(void)
{
- return cpu_clock(raw_smp_processor_id());
+ return local_clock();
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 09b4fa6e4d3b..398c0e8b332c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -598,88 +598,146 @@ out:
return ret;
}
-static void print_event_fields(struct trace_seq *s, struct list_head *head)
+enum {
+ FORMAT_HEADER = 1,
+ FORMAT_FIELD_SEPERATOR = 2,
+ FORMAT_PRINTFMT = 3,
+};
+
+static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
+ struct ftrace_event_call *call = m->private;
struct ftrace_event_field *field;
+ struct list_head *common_head = &ftrace_common_fields;
+ struct list_head *head = trace_get_fields(call);
- list_for_each_entry_reverse(field, head, link) {
- /*
- * Smartly shows the array type(except dynamic array).
- * Normal:
- * field:TYPE VAR
- * If TYPE := TYPE[LEN], it is shown:
- * field:TYPE VAR[LEN]
- */
- const char *array_descriptor = strchr(field->type, '[');
+ (*pos)++;
- if (!strncmp(field->type, "__data_loc", 10))
- array_descriptor = NULL;
+ switch ((unsigned long)v) {
+ case FORMAT_HEADER:
+ if (unlikely(list_empty(common_head)))
+ return NULL;
- if (!array_descriptor) {
- trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
- "\tsize:%u;\tsigned:%d;\n",
- field->type, field->name, field->offset,
- field->size, !!field->is_signed);
- } else {
- trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
- "\tsize:%u;\tsigned:%d;\n",
- (int)(array_descriptor - field->type),
- field->type, field->name,
- array_descriptor, field->offset,
- field->size, !!field->is_signed);
- }
+ field = list_entry(common_head->prev,
+ struct ftrace_event_field, link);
+ return field;
+
+ case FORMAT_FIELD_SEPERATOR:
+ if (unlikely(list_empty(head)))
+ return NULL;
+
+ field = list_entry(head->prev, struct ftrace_event_field, link);
+ return field;
+
+ case FORMAT_PRINTFMT:
+ /* all done */
+ return NULL;
}
+
+ field = v;
+ if (field->link.prev == common_head)
+ return (void *)FORMAT_FIELD_SEPERATOR;
+ else if (field->link.prev == head)
+ return (void *)FORMAT_PRINTFMT;
+
+ field = list_entry(field->link.prev, struct ftrace_event_field, link);
+
+ return field;
}
-static ssize_t
-event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
- loff_t *ppos)
+static void *f_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call = filp->private_data;
- struct list_head *head;
- struct trace_seq *s;
- char *buf;
- int r;
+ loff_t l = 0;
+ void *p;
- if (*ppos)
+ /* Start by showing the header */
+ if (!*pos)
+ return (void *)FORMAT_HEADER;
+
+ p = (void *)FORMAT_HEADER;
+ do {
+ p = f_next(m, p, &l);
+ } while (p && l < *pos);
+
+ return p;
+}
+
+static int f_show(struct seq_file *m, void *v)
+{
+ struct ftrace_event_call *call = m->private;
+ struct ftrace_event_field *field;
+ const char *array_descriptor;
+
+ switch ((unsigned long)v) {
+ case FORMAT_HEADER:
+ seq_printf(m, "name: %s\n", call->name);
+ seq_printf(m, "ID: %d\n", call->event.type);
+ seq_printf(m, "format:\n");
return 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
+ case FORMAT_FIELD_SEPERATOR:
+ seq_putc(m, '\n');
+ return 0;
- trace_seq_init(s);
+ case FORMAT_PRINTFMT:
+ seq_printf(m, "\nprint fmt: %s\n",
+ call->print_fmt);
+ return 0;
+ }
- trace_seq_printf(s, "name: %s\n", call->name);
- trace_seq_printf(s, "ID: %d\n", call->event.type);
- trace_seq_printf(s, "format:\n");
+ field = v;
- /* print common fields */
- print_event_fields(s, &ftrace_common_fields);
+ /*
+ * Smartly shows the array type(except dynamic array).
+ * Normal:
+ * field:TYPE VAR
+ * If TYPE := TYPE[LEN], it is shown:
+ * field:TYPE VAR[LEN]
+ */
+ array_descriptor = strchr(field->type, '[');
- trace_seq_putc(s, '\n');
+ if (!strncmp(field->type, "__data_loc", 10))
+ array_descriptor = NULL;
- /* print event specific fields */
- head = trace_get_fields(call);
- print_event_fields(s, head);
+ if (!array_descriptor)
+ seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ field->type, field->name, field->offset,
+ field->size, !!field->is_signed);
+ else
+ seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",
+ (int)(array_descriptor - field->type),
+ field->type, field->name,
+ array_descriptor, field->offset,
+ field->size, !!field->is_signed);
- r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt);
+ return 0;
+}
- if (!r) {
- /*
- * ug! The format output is bigger than a PAGE!!
- */
- buf = "FORMAT TOO BIG\n";
- r = simple_read_from_buffer(ubuf, cnt, ppos,
- buf, strlen(buf));
- goto out;
- }
+static void f_stop(struct seq_file *m, void *p)
+{
+}
- r = simple_read_from_buffer(ubuf, cnt, ppos,
- s->buffer, s->len);
- out:
- kfree(s);
- return r;
+static const struct seq_operations trace_format_seq_ops = {
+ .start = f_start,
+ .next = f_next,
+ .stop = f_stop,
+ .show = f_show,
+};
+
+static int trace_format_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_event_call *call = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, &trace_format_seq_ops);
+ if (ret < 0)
+ return ret;
+
+ m = file->private_data;
+ m->private = call;
+
+ return 0;
}
static ssize_t
@@ -877,8 +935,10 @@ static const struct file_operations ftrace_enable_fops = {
};
static const struct file_operations ftrace_event_format_fops = {
- .open = tracing_open_generic,
- .read = event_format_read,
+ .open = trace_format_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
static const struct file_operations ftrace_event_id_fops = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index fcb5a542cd21..c93bcb248638 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter,
* if the output fails.
*/
data->ent = *curr;
- data->ret = *next;
+ /*
+ * If the next event is not a return type, then
+ * we only care about what type it is. Otherwise we can
+ * safely copy the entire event.
+ */
+ if (next->ent.type == TRACE_GRAPH_RET)
+ data->ret = *next;
+ else
+ data->ret.ent.type = next->ent.type;
}
}
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
new file mode 100644
index 000000000000..7b8ecd751d93
--- /dev/null
+++ b/kernel/trace/trace_kdb.c
@@ -0,0 +1,136 @@
+/*
+ * kdb helper for dumping the ftrace buffer
+ *
+ * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com>
+ *
+ * ftrace_dump_buf based on ftrace_dump:
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ */
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/kdb.h>
+#include <linux/ftrace.h>
+
+#include "../debug/kdb/kdb_private.h"
+#include "trace.h"
+#include "trace_output.h"
+
+static void ftrace_dump_buf(int skip_lines, long cpu_file)
+{
+ /* use static because iter can be a bit big for the stack */
+ static struct trace_iterator iter;
+ unsigned int old_userobj;
+ int cnt = 0, cpu;
+
+ trace_init_global_iter(&iter);
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&iter.tr->data[cpu]->disabled);
+ }
+
+ old_userobj = trace_flags;
+
+ /* don't look at user memory in panic mode */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+ kdb_printf("Dumping ftrace buffer:\n");
+
+ /* reset all but tr, trace, and overruns */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+ iter.iter_flags |= TRACE_FILE_LAT_FMT;
+ iter.pos = -1;
+
+ if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ for_each_tracing_cpu(cpu) {
+ iter.buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu);
+ ring_buffer_read_start(iter.buffer_iter[cpu]);
+ tracing_iter_reset(&iter, cpu);
+ }
+ } else {
+ iter.cpu_file = cpu_file;
+ iter.buffer_iter[cpu_file] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+ ring_buffer_read_start(iter.buffer_iter[cpu_file]);
+ tracing_iter_reset(&iter, cpu_file);
+ }
+ if (!trace_empty(&iter))
+ trace_find_next_entry_inc(&iter);
+ while (!trace_empty(&iter)) {
+ if (!cnt)
+ kdb_printf("---------------------------------\n");
+ cnt++;
+
+ if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ print_trace_line(&iter);
+ if (!skip_lines)
+ trace_printk_seq(&iter.seq);
+ else
+ skip_lines--;
+ if (KDB_FLAG(CMD_INTERRUPT))
+ goto out;
+ }
+
+ if (!cnt)
+ kdb_printf(" (ftrace buffer empty)\n");
+ else
+ kdb_printf("---------------------------------\n");
+
+out:
+ trace_flags = old_userobj;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
+ }
+
+ for_each_tracing_cpu(cpu)
+ if (iter.buffer_iter[cpu])
+ ring_buffer_read_finish(iter.buffer_iter[cpu]);
+}
+
+/*
+ * kdb_ftdump - Dump the ftrace log buffer
+ */
+static int kdb_ftdump(int argc, const char **argv)
+{
+ int skip_lines = 0;
+ long cpu_file;
+ char *cp;
+
+ if (argc > 2)
+ return KDB_ARGCOUNT;
+
+ if (argc) {
+ skip_lines = simple_strtol(argv[1], &cp, 0);
+ if (*cp)
+ skip_lines = 0;
+ }
+
+ if (argc == 2) {
+ cpu_file = simple_strtol(argv[2], &cp, 0);
+ if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 ||
+ !cpu_online(cpu_file))
+ return KDB_BADINT;
+ } else {
+ cpu_file = TRACE_PIPE_ALL_CPU;
+ }
+
+ kdb_trap_printk++;
+ ftrace_dump_buf(skip_lines, cpu_file);
+ kdb_trap_printk--;
+
+ return 0;
+}
+
+static __init int kdb_ftrace_register(void)
+{
+ kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
+ "Dump ftrace log", 0, KDB_REPEAT_NONE);
+ return 0;
+}
+
+late_initcall(kdb_ftrace_register);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/highuid.h>
#include <linux/cred.h>
/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
schedule_work(&ns->destroyer);
}
EXPORT_SYMBOL(free_user_ns);
+
+uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+{
+ struct user_namespace *tmp;
+
+ if (likely(to == cred->user->user_ns))
+ return uid;
+
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of it's parents?
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (uid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping */
+ return overflowuid;
+}
+
+gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+{
+ struct user_namespace *tmp;
+
+ if (likely(to == cred->user->user_ns))
+ return gid;
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of it's parents?
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (gid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping */
+ return overflowgid;
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 327d2deb4451..2994a0e3a61c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,41 +33,287 @@
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/workqueue.h>
+#include <linux/idr.h>
+
+#include "workqueue_sched.h"
+
+enum {
+ /* global_cwq flags */
+ GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
+ GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */
+ GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
+ GCWQ_FREEZING = 1 << 3, /* freeze in progress */
+ GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */
+
+ /* worker flags */
+ WORKER_STARTED = 1 << 0, /* started */
+ WORKER_DIE = 1 << 1, /* die die die */
+ WORKER_IDLE = 1 << 2, /* is idle */
+ WORKER_PREP = 1 << 3, /* preparing to run works */
+ WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
+ WORKER_REBIND = 1 << 5, /* mom is home, come back */
+ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
+ WORKER_UNBOUND = 1 << 7, /* worker is unbound */
+
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
+ WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+
+ /* gcwq->trustee_state */
+ TRUSTEE_START = 0, /* start */
+ TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
+ TRUSTEE_BUTCHER = 2, /* butcher workers */
+ TRUSTEE_RELEASE = 3, /* release workers */
+ TRUSTEE_DONE = 4, /* trustee is done */
+
+ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
+ BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
+ BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
+
+ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
+ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
+
+ MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
+ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
+ CREATE_COOLDOWN = HZ, /* time to breath after fail */
+ TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
+
+ /*
+ * Rescue workers are used only on emergencies and shared by
+ * all cpus. Give -20.
+ */
+ RESCUER_NICE_LEVEL = -20,
+};
/*
- * The per-CPU workqueue (if single thread, we always use the first
- * possible cpu).
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Set during initialization and read-only afterwards.
+ *
+ * P: Preemption protected. Disabling preemption is enough and should
+ * only be modified and accessed from the local cpu.
+ *
+ * L: gcwq->lock protected. Access with gcwq->lock held.
+ *
+ * X: During normal operation, modification requires gcwq->lock and
+ * should be done only from local cpu. Either disabling preemption
+ * on local cpu or grabbing gcwq->lock is enough for read access.
+ * If GCWQ_DISASSOCIATED is set, it's identical to L.
+ *
+ * F: wq->flush_mutex protected.
+ *
+ * W: workqueue_lock protected.
*/
-struct cpu_workqueue_struct {
- spinlock_t lock;
+struct global_cwq;
- struct list_head worklist;
- wait_queue_head_t more_work;
- struct work_struct *current_work;
+/*
+ * The poor guys doing the actual heavy lifting. All on-duty workers
+ * are either serving the manager role, on idle list or on busy hash.
+ */
+struct worker {
+ /* on idle list while idle, on busy hash table while busy */
+ union {
+ struct list_head entry; /* L: while idle */
+ struct hlist_node hentry; /* L: while busy */
+ };
- struct workqueue_struct *wq;
- struct task_struct *thread;
-} ____cacheline_aligned;
+ struct work_struct *current_work; /* L: work being processed */
+ struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
+ struct list_head scheduled; /* L: scheduled works */
+ struct task_struct *task; /* I: worker task */
+ struct global_cwq *gcwq; /* I: the associated gcwq */
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+ unsigned long last_active; /* L: last active timestamp */
+ unsigned int flags; /* X: flags */
+ int id; /* I: worker id */
+ struct work_struct rebind_work; /* L: rebind worker to cpu */
+};
+
+/*
+ * Global per-cpu workqueue. There's one and only one for each cpu
+ * and all works are queued and processed here regardless of their
+ * target workqueues.
+ */
+struct global_cwq {
+ spinlock_t lock; /* the gcwq lock */
+ struct list_head worklist; /* L: list of pending works */
+ unsigned int cpu; /* I: the associated cpu */
+ unsigned int flags; /* L: GCWQ_* flags */
+
+ int nr_workers; /* L: total number of workers */
+ int nr_idle; /* L: currently idle ones */
+
+ /* workers are chained either in the idle_list or busy_hash */
+ struct list_head idle_list; /* X: list of idle workers */
+ struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
+ /* L: hash of busy workers */
+
+ struct timer_list idle_timer; /* L: worker idle timeout */
+ struct timer_list mayday_timer; /* L: SOS timer for dworkers */
+
+ struct ida worker_ida; /* L: for worker IDs */
+
+ struct task_struct *trustee; /* L: for gcwq shutdown */
+ unsigned int trustee_state; /* L: trustee state */
+ wait_queue_head_t trustee_wait; /* trustee wait */
+ struct worker *first_idle; /* L: first idle worker */
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of
+ * work_struct->data are used for flags and thus cwqs need to be
+ * aligned at two's power of the number of flag bits.
+ */
+struct cpu_workqueue_struct {
+ struct global_cwq *gcwq; /* I: the associated gcwq */
+ struct workqueue_struct *wq; /* I: the owning workqueue */
+ int work_color; /* L: current color */
+ int flush_color; /* L: flushing color */
+ int nr_in_flight[WORK_NR_COLORS];
+ /* L: nr of in_flight works */
+ int nr_active; /* L: nr of active works */
+ int max_active; /* L: max active works */
+ struct list_head delayed_works; /* L: delayed works */
+};
+
+/*
+ * Structure used to wait for workqueue flush.
+ */
+struct wq_flusher {
+ struct list_head list; /* F: list of flushers */
+ int flush_color; /* F: flush color waiting for */
+ struct completion done; /* flush completion */
+};
+
+/*
+ * All cpumasks are assumed to be always set on UP and thus can't be
+ * used to determine whether there's something to be done.
+ */
+#ifdef CONFIG_SMP
+typedef cpumask_var_t mayday_mask_t;
+#define mayday_test_and_set_cpu(cpu, mask) \
+ cpumask_test_and_set_cpu((cpu), (mask))
+#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
+#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
+#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp))
+#define free_mayday_mask(mask) free_cpumask_var((mask))
+#else
+typedef unsigned long mayday_mask_t;
+#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
+#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
+#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
+#define alloc_mayday_mask(maskp, gfp) true
+#define free_mayday_mask(mask) do { } while (0)
+#endif
/*
* The externally visible workqueue abstraction is an array of
* per-CPU workqueues:
*/
struct workqueue_struct {
- struct cpu_workqueue_struct *cpu_wq;
- struct list_head list;
- const char *name;
- int singlethread;
- int freezeable; /* Freeze threads during suspend */
- int rt;
+ unsigned int flags; /* I: WQ_* flags */
+ union {
+ struct cpu_workqueue_struct __percpu *pcpu;
+ struct cpu_workqueue_struct *single;
+ unsigned long v;
+ } cpu_wq; /* I: cwq's */
+ struct list_head list; /* W: list of all workqueues */
+
+ struct mutex flush_mutex; /* protects wq flushing */
+ int work_color; /* F: current work color */
+ int flush_color; /* F: current flush color */
+ atomic_t nr_cwqs_to_flush; /* flush in progress */
+ struct wq_flusher *first_flusher; /* F: first flusher */
+ struct list_head flusher_queue; /* F: flush waiters */
+ struct list_head flusher_overflow; /* F: flush overflow list */
+
+ mayday_mask_t mayday_mask; /* cpus requesting rescue */
+ struct worker *rescuer; /* I: rescue worker */
+
+ int saved_max_active; /* W: saved cwq max_active */
+ const char *name; /* I: workqueue name */
#ifdef CONFIG_LOCKDEP
- struct lockdep_map lockdep_map;
+ struct lockdep_map lockdep_map;
#endif
};
+struct workqueue_struct *system_wq __read_mostly;
+struct workqueue_struct *system_long_wq __read_mostly;
+struct workqueue_struct *system_nrt_wq __read_mostly;
+struct workqueue_struct *system_unbound_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL_GPL(system_long_wq);
+EXPORT_SYMBOL_GPL(system_nrt_wq);
+EXPORT_SYMBOL_GPL(system_unbound_wq);
+
+#define for_each_busy_worker(worker, i, pos, gcwq) \
+ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
+ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
+
+static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
+ unsigned int sw)
+{
+ if (cpu < nr_cpu_ids) {
+ if (sw & 1) {
+ cpu = cpumask_next(cpu, mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ if (sw & 2)
+ return WORK_CPU_UNBOUND;
+ }
+ return WORK_CPU_NONE;
+}
+
+static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
+ struct workqueue_struct *wq)
+{
+ return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
+}
+
+/*
+ * CPU iterators
+ *
+ * An extra gcwq is defined for an invalid cpu number
+ * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
+ * specific CPU. The following iterators are similar to
+ * for_each_*_cpu() iterators but also considers the unbound gcwq.
+ *
+ * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND
+ * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND
+ * for_each_cwq_cpu() : possible CPUs for bound workqueues,
+ * WORK_CPU_UNBOUND for unbound workqueues
+ */
+#define for_each_gcwq_cpu(cpu) \
+ for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \
+ (cpu) < WORK_CPU_NONE; \
+ (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+
+#define for_each_online_gcwq_cpu(cpu) \
+ for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \
+ (cpu) < WORK_CPU_NONE; \
+ (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+
+#define for_each_cwq_cpu(cpu, wq) \
+ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \
+ (cpu) < WORK_CPU_NONE; \
+ (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+
+#ifdef CONFIG_LOCKDEP
+/**
+ * in_workqueue_context() - in context of specified workqueue?
+ * @wq: the workqueue of interest
+ *
+ * Checks lockdep state to see if the current task is executing from
+ * within a workqueue item. This function exists only if lockdep is
+ * enabled.
+ */
+int in_workqueue_context(struct workqueue_struct *wq)
+{
+ return lock_is_held(&wq->lockdep_map);
+}
+#endif
+
#ifdef CONFIG_DEBUG_OBJECTS_WORK
static struct debug_obj_descr work_debug_descr;
@@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
* statically initialized. We just make sure that it
* is tracked in the object tracker.
*/
- if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
+ if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
debug_object_init(work, &work_debug_descr);
debug_object_activate(work, &work_debug_descr);
return 0;
@@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { }
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
+static bool workqueue_freezing; /* W: have wqs started freezing? */
+
+/*
+ * The almighty global cpu workqueues. nr_running is the only field
+ * which is expected to be used frequently by other cpus via
+ * try_to_wake_up(). Put it in a separate cacheline.
+ */
+static DEFINE_PER_CPU(struct global_cwq, global_cwq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
-static int singlethread_cpu __read_mostly;
-static const struct cpumask *cpu_singlethread_map __read_mostly;
/*
- * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
- * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
- * which comes in between can't use for_each_online_cpu(). We could
- * use cpu_possible_map, the cpumask below is more a documentation
- * than optimization.
+ * Global cpu workqueue and nr_running counter for unbound gcwq. The
+ * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
+ * workers have WORKER_UNBOUND set.
*/
-static cpumask_var_t cpu_populated_map __read_mostly;
+static struct global_cwq unbound_global_cwq;
+static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */
-/* If it's single threaded, it isn't in the list of workqueues. */
-static inline int is_wq_single_threaded(struct workqueue_struct *wq)
+static int worker_thread(void *__worker);
+
+static struct global_cwq *get_gcwq(unsigned int cpu)
{
- return wq->singlethread;
+ if (cpu != WORK_CPU_UNBOUND)
+ return &per_cpu(global_cwq, cpu);
+ else
+ return &unbound_global_cwq;
}
-static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
+static atomic_t *get_gcwq_nr_running(unsigned int cpu)
{
- return is_wq_single_threaded(wq)
- ? cpu_singlethread_map : cpu_populated_map;
+ if (cpu != WORK_CPU_UNBOUND)
+ return &per_cpu(gcwq_nr_running, cpu);
+ else
+ return &unbound_gcwq_nr_running;
}
-static
-struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
+static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
+ struct workqueue_struct *wq)
{
- if (unlikely(is_wq_single_threaded(wq)))
- cpu = singlethread_cpu;
- return per_cpu_ptr(wq->cpu_wq, cpu);
+ if (!(wq->flags & WQ_UNBOUND)) {
+ if (likely(cpu < nr_cpu_ids)) {
+#ifdef CONFIG_SMP
+ return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+#else
+ return wq->cpu_wq.single;
+#endif
+ }
+ } else if (likely(cpu == WORK_CPU_UNBOUND))
+ return wq->cpu_wq.single;
+ return NULL;
+}
+
+static unsigned int work_color_to_flags(int color)
+{
+ return color << WORK_STRUCT_COLOR_SHIFT;
+}
+
+static int get_work_color(struct work_struct *work)
+{
+ return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
+ ((1 << WORK_STRUCT_COLOR_BITS) - 1);
+}
+
+static int work_next_color(int color)
+{
+ return (color + 1) % WORK_NR_COLORS;
}
/*
- * Set the workqueue on which a work item is to be run
- * - Must *only* be called if the pending flag is set
+ * A work's data points to the cwq with WORK_STRUCT_CWQ set while the
+ * work is on queue. Once execution starts, WORK_STRUCT_CWQ is
+ * cleared and the work data contains the cpu number it was last on.
+ *
+ * set_work_{cwq|cpu}() and clear_work_data() can be used to set the
+ * cwq, cpu or clear work->data. These functions should only be
+ * called while the work is owned - ie. while the PENDING bit is set.
+ *
+ * get_work_[g]cwq() can be used to obtain the gcwq or cwq
+ * corresponding to a work. gcwq is available once the work has been
+ * queued anywhere after initialization. cwq is available only from
+ * queueing until execution starts.
*/
-static inline void set_wq_data(struct work_struct *work,
- struct cpu_workqueue_struct *cwq)
+static inline void set_work_data(struct work_struct *work, unsigned long data,
+ unsigned long flags)
{
- unsigned long new;
-
BUG_ON(!work_pending(work));
+ atomic_long_set(&work->data, data | flags | work_static(work));
+}
+
+static void set_work_cwq(struct work_struct *work,
+ struct cpu_workqueue_struct *cwq,
+ unsigned long extra_flags)
+{
+ set_work_data(work, (unsigned long)cwq,
+ WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
+}
+
+static void set_work_cpu(struct work_struct *work, unsigned int cpu)
+{
+ set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING);
+}
+
+static void clear_work_data(struct work_struct *work)
+{
+ set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+}
+
+static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+
+ if (data & WORK_STRUCT_CWQ)
+ return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
+ else
+ return NULL;
+}
+
+static struct global_cwq *get_work_gcwq(struct work_struct *work)
+{
+ unsigned long data = atomic_long_read(&work->data);
+ unsigned int cpu;
+
+ if (data & WORK_STRUCT_CWQ)
+ return ((struct cpu_workqueue_struct *)
+ (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+
+ cpu = data >> WORK_STRUCT_FLAG_BITS;
+ if (cpu == WORK_CPU_NONE)
+ return NULL;
+
+ BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
+ return get_gcwq(cpu);
+}
+
+/*
+ * Policy functions. These define the policies on how the global
+ * worker pool is managed. Unless noted otherwise, these functions
+ * assume that they're being called with gcwq->lock held.
+ */
+
+static bool __need_more_worker(struct global_cwq *gcwq)
+{
+ return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
+ gcwq->flags & GCWQ_HIGHPRI_PENDING;
+}
+
+/*
+ * Need to wake up a worker? Called from anything but currently
+ * running workers.
+ */
+static bool need_more_worker(struct global_cwq *gcwq)
+{
+ return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+}
+
+/* Can I start working? Called from busy but !running workers. */
+static bool may_start_working(struct global_cwq *gcwq)
+{
+ return gcwq->nr_idle;
+}
+
+/* Do I need to keep working? Called from currently running workers. */
+static bool keep_working(struct global_cwq *gcwq)
+{
+ atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+ return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
+}
+
+/* Do we need a new worker? Called from manager. */
+static bool need_to_create_worker(struct global_cwq *gcwq)
+{
+ return need_more_worker(gcwq) && !may_start_working(gcwq);
+}
+
+/* Do I need to be the manager? */
+static bool need_to_manage_workers(struct global_cwq *gcwq)
+{
+ return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+}
+
+/* Do we have too many workers and should some go away? */
+static bool too_many_workers(struct global_cwq *gcwq)
+{
+ bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
+ int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
+ int nr_busy = gcwq->nr_workers - nr_idle;
- new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
- new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
- atomic_long_set(&work->data, new);
+ return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
/*
- * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
+ * Wake up functions.
*/
-static inline void clear_wq_data(struct work_struct *work)
+
+/* Return the first worker. Safe with preemption disabled */
+static struct worker *first_worker(struct global_cwq *gcwq)
{
- unsigned long flags = *work_data_bits(work) &
- (1UL << WORK_STRUCT_STATIC);
- atomic_long_set(&work->data, flags);
+ if (unlikely(list_empty(&gcwq->idle_list)))
+ return NULL;
+
+ return list_first_entry(&gcwq->idle_list, struct worker, entry);
}
-static inline
-struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
+/**
+ * wake_up_worker - wake up an idle worker
+ * @gcwq: gcwq to wake worker for
+ *
+ * Wake up the first idle worker of @gcwq.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void wake_up_worker(struct global_cwq *gcwq)
{
- return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
+ struct worker *worker = first_worker(gcwq);
+
+ if (likely(worker))
+ wake_up_process(worker->task);
+}
+
+/**
+ * wq_worker_waking_up - a worker is waking up
+ * @task: task waking up
+ * @cpu: CPU @task is waking up to
+ *
+ * This function is called during try_to_wake_up() when a worker is
+ * being awoken.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
+{
+ struct worker *worker = kthread_data(task);
+
+ if (likely(!(worker->flags & WORKER_NOT_RUNNING)))
+ atomic_inc(get_gcwq_nr_running(cpu));
+}
+
+/**
+ * wq_worker_sleeping - a worker is going to sleep
+ * @task: task going to sleep
+ * @cpu: CPU in question, must be the current CPU number
+ *
+ * This function is called during schedule() when a busy worker is
+ * going to sleep. Worker on the same cpu can be woken up by
+ * returning pointer to its task.
+ *
+ * CONTEXT:
+ * spin_lock_irq(rq->lock)
+ *
+ * RETURNS:
+ * Worker task on @cpu to wake up, %NULL if none.
+ */
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+ unsigned int cpu)
+{
+ struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ atomic_t *nr_running = get_gcwq_nr_running(cpu);
+
+ if (unlikely(worker->flags & WORKER_NOT_RUNNING))
+ return NULL;
+
+ /* this can only happen on the local cpu */
+ BUG_ON(cpu != raw_smp_processor_id());
+
+ /*
+ * The counterpart of the following dec_and_test, implied mb,
+ * worklist not empty test sequence is in insert_work().
+ * Please read comment there.
+ *
+ * NOT_RUNNING is clear. This means that trustee is not in
+ * charge and we're running on the local cpu w/ rq lock held
+ * and preemption disabled, which in turn means that none else
+ * could be manipulating idle_list, so dereferencing idle_list
+ * without gcwq lock is safe.
+ */
+ if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
+ to_wakeup = first_worker(gcwq);
+ return to_wakeup ? to_wakeup->task : NULL;
+}
+
+/**
+ * worker_set_flags - set worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to set
+ * @wakeup: wakeup an idle worker if necessary
+ *
+ * Set @flags in @worker->flags and adjust nr_running accordingly. If
+ * nr_running becomes zero and @wakeup is %true, an idle worker is
+ * woken up.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock)
+ */
+static inline void worker_set_flags(struct worker *worker, unsigned int flags,
+ bool wakeup)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ /*
+ * If transitioning into NOT_RUNNING, adjust nr_running and
+ * wake up an idle worker as necessary if requested by
+ * @wakeup.
+ */
+ if ((flags & WORKER_NOT_RUNNING) &&
+ !(worker->flags & WORKER_NOT_RUNNING)) {
+ atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+
+ if (wakeup) {
+ if (atomic_dec_and_test(nr_running) &&
+ !list_empty(&gcwq->worklist))
+ wake_up_worker(gcwq);
+ } else
+ atomic_dec(nr_running);
+ }
+
+ worker->flags |= flags;
}
+/**
+ * worker_clr_flags - clear worker flags and adjust nr_running accordingly
+ * @worker: self
+ * @flags: flags to clear
+ *
+ * Clear @flags in @worker->flags and adjust nr_running accordingly.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock)
+ */
+static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ unsigned int oflags = worker->flags;
+
+ WARN_ON_ONCE(worker->task != current);
+
+ worker->flags &= ~flags;
+
+ /* if transitioning out of NOT_RUNNING, increment nr_running */
+ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
+ if (!(worker->flags & WORKER_NOT_RUNNING))
+ atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+}
+
+/**
+ * busy_worker_head - return the busy hash head for a work
+ * @gcwq: gcwq of interest
+ * @work: work to be hashed
+ *
+ * Return hash head of @gcwq for @work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to the hash head.
+ */
+static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
+ struct work_struct *work)
+{
+ const int base_shift = ilog2(sizeof(struct work_struct));
+ unsigned long v = (unsigned long)work;
+
+ /* simple shift and fold hash, do we need something better? */
+ v >>= base_shift;
+ v += v >> BUSY_WORKER_HASH_ORDER;
+ v &= BUSY_WORKER_HASH_MASK;
+
+ return &gcwq->busy_hash[v];
+}
+
+/**
+ * __find_worker_executing_work - find worker which is executing a work
+ * @gcwq: gcwq of interest
+ * @bwh: hash head as returned by busy_worker_head()
+ * @work: work to find worker for
+ *
+ * Find a worker which is executing @work on @gcwq. @bwh should be
+ * the hash head obtained by calling busy_worker_head() with the same
+ * work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to worker which is executing @work if found, NULL
+ * otherwise.
+ */
+static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
+ struct hlist_head *bwh,
+ struct work_struct *work)
+{
+ struct worker *worker;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry(worker, tmp, bwh, hentry)
+ if (worker->current_work == work)
+ return worker;
+ return NULL;
+}
+
+/**
+ * find_worker_executing_work - find worker which is executing a work
+ * @gcwq: gcwq of interest
+ * @work: work to find worker for
+ *
+ * Find a worker which is executing @work on @gcwq. This function is
+ * identical to __find_worker_executing_work() except that this
+ * function calculates @bwh itself.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to worker which is executing @work if found, NULL
+ * otherwise.
+ */
+static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
+ struct work_struct *work)
+{
+ return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
+ work);
+}
+
+/**
+ * gcwq_determine_ins_pos - find insertion position
+ * @gcwq: gcwq of interest
+ * @cwq: cwq a work is being queued for
+ *
+ * A work for @cwq is about to be queued on @gcwq, determine insertion
+ * position for the work. If @cwq is for HIGHPRI wq, the work is
+ * queued at the head of the queue but in FIFO order with respect to
+ * other HIGHPRI works; otherwise, at the end of the queue. This
+ * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
+ * there are HIGHPRI works pending.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ *
+ * RETURNS:
+ * Pointer to inserstion position.
+ */
+static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
+ struct cpu_workqueue_struct *cwq)
+{
+ struct work_struct *twork;
+
+ if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
+ return &gcwq->worklist;
+
+ list_for_each_entry(twork, &gcwq->worklist, entry) {
+ struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
+
+ if (!(tcwq->wq->flags & WQ_HIGHPRI))
+ break;
+ }
+
+ gcwq->flags |= GCWQ_HIGHPRI_PENDING;
+ return &twork->entry;
+}
+
+/**
+ * insert_work - insert a work into gcwq
+ * @cwq: cwq @work belongs to
+ * @work: work to insert
+ * @head: insertion point
+ * @extra_flags: extra WORK_STRUCT_* flags to set
+ *
+ * Insert @work which belongs to @cwq into @gcwq after @head.
+ * @extra_flags is or'd to work_struct flags.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
static void insert_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work, struct list_head *head)
+ struct work_struct *work, struct list_head *head,
+ unsigned int extra_flags)
{
- trace_workqueue_insertion(cwq->thread, work);
+ struct global_cwq *gcwq = cwq->gcwq;
+
+ /* we own @work, set data and link */
+ set_work_cwq(work, cwq, extra_flags);
- set_wq_data(work, cwq);
/*
* Ensure that we get the right work->data if we see the
* result of list_add() below, see try_to_grab_pending().
*/
smp_wmb();
+
list_add_tail(&work->entry, head);
- wake_up(&cwq->more_work);
+
+ /*
+ * Ensure either worker_sched_deactivated() sees the above
+ * list_add_tail() or we see zero nr_running to avoid workers
+ * lying around lazily while there are works to be processed.
+ */
+ smp_mb();
+
+ if (__need_more_worker(gcwq))
+ wake_up_worker(gcwq);
}
-static void __queue_work(struct cpu_workqueue_struct *cwq,
+static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
+ struct global_cwq *gcwq;
+ struct cpu_workqueue_struct *cwq;
+ struct list_head *worklist;
unsigned long flags;
debug_work_activate(work);
- spin_lock_irqsave(&cwq->lock, flags);
- insert_work(cwq, work, &cwq->worklist);
- spin_unlock_irqrestore(&cwq->lock, flags);
+
+ /* determine gcwq to use */
+ if (!(wq->flags & WQ_UNBOUND)) {
+ struct global_cwq *last_gcwq;
+
+ if (unlikely(cpu == WORK_CPU_UNBOUND))
+ cpu = raw_smp_processor_id();
+
+ /*
+ * It's multi cpu. If @wq is non-reentrant and @work
+ * was previously on a different cpu, it might still
+ * be running there, in which case the work needs to
+ * be queued on that cpu to guarantee non-reentrance.
+ */
+ gcwq = get_gcwq(cpu);
+ if (wq->flags & WQ_NON_REENTRANT &&
+ (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) {
+ struct worker *worker;
+
+ spin_lock_irqsave(&last_gcwq->lock, flags);
+
+ worker = find_worker_executing_work(last_gcwq, work);
+
+ if (worker && worker->current_cwq->wq == wq)
+ gcwq = last_gcwq;
+ else {
+ /* meh... not running there, queue here */
+ spin_unlock_irqrestore(&last_gcwq->lock, flags);
+ spin_lock_irqsave(&gcwq->lock, flags);
+ }
+ } else
+ spin_lock_irqsave(&gcwq->lock, flags);
+ } else {
+ gcwq = get_gcwq(WORK_CPU_UNBOUND);
+ spin_lock_irqsave(&gcwq->lock, flags);
+ }
+
+ /* gcwq determined, get cwq and queue */
+ cwq = get_cwq(gcwq->cpu, wq);
+
+ BUG_ON(!list_empty(&work->entry));
+
+ cwq->nr_in_flight[cwq->work_color]++;
+
+ if (likely(cwq->nr_active < cwq->max_active)) {
+ cwq->nr_active++;
+ worklist = gcwq_determine_ins_pos(gcwq, cwq);
+ } else
+ worklist = &cwq->delayed_works;
+
+ insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));
+
+ spin_unlock_irqrestore(&gcwq->lock, flags);
}
/**
@@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
{
int ret = 0;
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
- BUG_ON(!list_empty(&work->entry));
- __queue_work(wq_per_cpu(wq, cpu), work);
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ __queue_work(cpu, wq, work);
ret = 1;
}
return ret;
@@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on);
static void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
- struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
- struct workqueue_struct *wq = cwq->wq;
+ struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
- __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
+ __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
}
/**
@@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ unsigned int lcpu;
+
BUG_ON(timer_pending(timer));
BUG_ON(!list_empty(&work->entry));
timer_stats_timer_set_start_info(&dwork->timer);
- /* This stores cwq for the moment, for the timer_fn */
- set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
+ /*
+ * This stores cwq for the moment, for the timer_fn.
+ * Note that the work's gcwq is preserved to allow
+ * reentrance detection for delayed works.
+ */
+ if (!(wq->flags & WQ_UNBOUND)) {
+ struct global_cwq *gcwq = get_work_gcwq(work);
+
+ if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND)
+ lcpu = gcwq->cpu;
+ else
+ lcpu = raw_smp_processor_id();
+ } else
+ lcpu = WORK_CPU_UNBOUND;
+
+ set_work_cwq(work, get_cwq(lcpu, wq), 0);
+
timer->expires = jiffies + delay;
timer->data = (unsigned long)dwork;
timer->function = delayed_work_timer_fn;
@@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
-static void run_workqueue(struct cpu_workqueue_struct *cwq)
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
{
- spin_lock_irq(&cwq->lock);
- while (!list_empty(&cwq->worklist)) {
- struct work_struct *work = list_entry(cwq->worklist.next,
- struct work_struct, entry);
- work_func_t f = work->func;
-#ifdef CONFIG_LOCKDEP
+ struct global_cwq *gcwq = worker->gcwq;
+
+ BUG_ON(worker->flags & WORKER_IDLE);
+ BUG_ON(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev));
+
+ /* can't use worker_set_flags(), also called from start_worker() */
+ worker->flags |= WORKER_IDLE;
+ gcwq->nr_idle++;
+ worker->last_active = jiffies;
+
+ /* idle_list is LIFO */
+ list_add(&worker->entry, &gcwq->idle_list);
+
+ if (likely(!(worker->flags & WORKER_ROGUE))) {
+ if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
+ mod_timer(&gcwq->idle_timer,
+ jiffies + IDLE_WORKER_TIMEOUT);
+ } else
+ wake_up_all(&gcwq->trustee_wait);
+
+ /* sanity check nr_running */
+ WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+ atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+}
+
+/**
+ * worker_leave_idle - leave idle state
+ * @worker: worker which is leaving idle state
+ *
+ * @worker is leaving idle state. Update stats.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void worker_leave_idle(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+
+ BUG_ON(!(worker->flags & WORKER_IDLE));
+ worker_clr_flags(worker, WORKER_IDLE);
+ gcwq->nr_idle--;
+ list_del_init(&worker->entry);
+}
+
+/**
+ * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
+ * @worker: self
+ *
+ * Works which are scheduled while the cpu is online must at least be
+ * scheduled to a worker which is bound to the cpu so that if they are
+ * flushed from cpu callbacks while cpu is going down, they are
+ * guaranteed to execute on the cpu.
+ *
+ * This function is to be used by rogue workers and rescuers to bind
+ * themselves to the target cpu and may race with cpu going down or
+ * coming online. kthread_bind() can't be used because it may put the
+ * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
+ * verbatim as it's best effort and blocking and gcwq may be
+ * [dis]associated in the meantime.
+ *
+ * This function tries set_cpus_allowed() and locks gcwq and verifies
+ * the binding against GCWQ_DISASSOCIATED which is set during
+ * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
+ * idle state or fetches works without dropping lock, it can guarantee
+ * the scheduling requirement described in the first paragraph.
+ *
+ * CONTEXT:
+ * Might sleep. Called without any lock but returns with gcwq->lock
+ * held.
+ *
+ * RETURNS:
+ * %true if the associated gcwq is online (@worker is successfully
+ * bound), %false if offline.
+ */
+static bool worker_maybe_bind_and_lock(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ struct task_struct *task = worker->task;
+
+ while (true) {
/*
- * It is permissible to free the struct work_struct
- * from inside the function that is called from it,
- * this we need to take into account for lockdep too.
- * To avoid bogus "held lock freed" warnings as well
- * as problems when looking into work->lockdep_map,
- * make a copy and use that here.
+ * The following call may fail, succeed or succeed
+ * without actually migrating the task to the cpu if
+ * it races with cpu hotunplug operation. Verify
+ * against GCWQ_DISASSOCIATED.
*/
- struct lockdep_map lockdep_map = work->lockdep_map;
-#endif
- trace_workqueue_execution(cwq->thread, work);
- debug_work_deactivate(work);
- cwq->current_work = work;
- list_del_init(cwq->worklist.next);
- spin_unlock_irq(&cwq->lock);
-
- BUG_ON(get_wq_data(work) != cwq);
- work_clear_pending(work);
- lock_map_acquire(&cwq->wq->lockdep_map);
- lock_map_acquire(&lockdep_map);
- f(work);
- lock_map_release(&lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
-
- if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
- printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(),
- task_pid_nr(current));
- printk(KERN_ERR " last function: ");
- print_symbol("%s\n", (unsigned long)f);
- debug_show_held_locks(current);
- dump_stack();
+ if (!(gcwq->flags & GCWQ_DISASSOCIATED))
+ set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+
+ spin_lock_irq(&gcwq->lock);
+ if (gcwq->flags & GCWQ_DISASSOCIATED)
+ return false;
+ if (task_cpu(task) == gcwq->cpu &&
+ cpumask_equal(&current->cpus_allowed,
+ get_cpu_mask(gcwq->cpu)))
+ return true;
+ spin_unlock_irq(&gcwq->lock);
+
+ /* CPU has come up inbetween, retry migration */
+ cpu_relax();
+ }
+}
+
+/*
+ * Function for worker->rebind_work used to rebind rogue busy workers
+ * to the associated cpu which is coming back online. This is
+ * scheduled by cpu up but can race with other cpu hotplug operations
+ * and may be executed twice without intervening cpu down.
+ */
+static void worker_rebind_fn(struct work_struct *work)
+{
+ struct worker *worker = container_of(work, struct worker, rebind_work);
+ struct global_cwq *gcwq = worker->gcwq;
+
+ if (worker_maybe_bind_and_lock(worker))
+ worker_clr_flags(worker, WORKER_REBIND);
+
+ spin_unlock_irq(&gcwq->lock);
+}
+
+static struct worker *alloc_worker(void)
+{
+ struct worker *worker;
+
+ worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ if (worker) {
+ INIT_LIST_HEAD(&worker->entry);
+ INIT_LIST_HEAD(&worker->scheduled);
+ INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+ /* on creation a worker is in !idle && prep state */
+ worker->flags = WORKER_PREP;
+ }
+ return worker;
+}
+
+/**
+ * create_worker - create a new workqueue worker
+ * @gcwq: gcwq the new worker will belong to
+ * @bind: whether to set affinity to @cpu or not
+ *
+ * Create a new worker which is bound to @gcwq. The returned worker
+ * can be started by calling start_worker() or destroyed using
+ * destroy_worker().
+ *
+ * CONTEXT:
+ * Might sleep. Does GFP_KERNEL allocations.
+ *
+ * RETURNS:
+ * Pointer to the newly created worker.
+ */
+static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+{
+ bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+ struct worker *worker = NULL;
+ int id = -1;
+
+ spin_lock_irq(&gcwq->lock);
+ while (ida_get_new(&gcwq->worker_ida, &id)) {
+ spin_unlock_irq(&gcwq->lock);
+ if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
+ goto fail;
+ spin_lock_irq(&gcwq->lock);
+ }
+ spin_unlock_irq(&gcwq->lock);
+
+ worker = alloc_worker();
+ if (!worker)
+ goto fail;
+
+ worker->gcwq = gcwq;
+ worker->id = id;
+
+ if (!on_unbound_cpu)
+ worker->task = kthread_create(worker_thread, worker,
+ "kworker/%u:%d", gcwq->cpu, id);
+ else
+ worker->task = kthread_create(worker_thread, worker,
+ "kworker/u:%d", id);
+ if (IS_ERR(worker->task))
+ goto fail;
+
+ /*
+ * A rogue worker will become a regular one if CPU comes
+ * online later on. Make sure every worker has
+ * PF_THREAD_BOUND set.
+ */
+ if (bind && !on_unbound_cpu)
+ kthread_bind(worker->task, gcwq->cpu);
+ else {
+ worker->task->flags |= PF_THREAD_BOUND;
+ if (on_unbound_cpu)
+ worker->flags |= WORKER_UNBOUND;
+ }
+
+ return worker;
+fail:
+ if (id >= 0) {
+ spin_lock_irq(&gcwq->lock);
+ ida_remove(&gcwq->worker_ida, id);
+ spin_unlock_irq(&gcwq->lock);
+ }
+ kfree(worker);
+ return NULL;
+}
+
+/**
+ * start_worker - start a newly created worker
+ * @worker: worker to start
+ *
+ * Make the gcwq aware of @worker and start it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void start_worker(struct worker *worker)
+{
+ worker->flags |= WORKER_STARTED;
+ worker->gcwq->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
+}
+
+/**
+ * destroy_worker - destroy a workqueue worker
+ * @worker: worker to be destroyed
+ *
+ * Destroy @worker and adjust @gcwq stats accordingly.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ */
+static void destroy_worker(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ int id = worker->id;
+
+ /* sanity check frenzy */
+ BUG_ON(worker->current_work);
+ BUG_ON(!list_empty(&worker->scheduled));
+
+ if (worker->flags & WORKER_STARTED)
+ gcwq->nr_workers--;
+ if (worker->flags & WORKER_IDLE)
+ gcwq->nr_idle--;
+
+ list_del_init(&worker->entry);
+ worker->flags |= WORKER_DIE;
+
+ spin_unlock_irq(&gcwq->lock);
+
+ kthread_stop(worker->task);
+ kfree(worker);
+
+ spin_lock_irq(&gcwq->lock);
+ ida_remove(&gcwq->worker_ida, id);
+}
+
+static void idle_worker_timeout(unsigned long __gcwq)
+{
+ struct global_cwq *gcwq = (void *)__gcwq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (too_many_workers(gcwq)) {
+ struct worker *worker;
+ unsigned long expires;
+
+ /* idle_list is kept in LIFO order, check the last one */
+ worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
+
+ if (time_before(jiffies, expires))
+ mod_timer(&gcwq->idle_timer, expires);
+ else {
+ /* it's been idle for too long, wake up manager */
+ gcwq->flags |= GCWQ_MANAGE_WORKERS;
+ wake_up_worker(gcwq);
+ }
+ }
+
+ spin_unlock_irq(&gcwq->lock);
+}
+
+static bool send_mayday(struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct workqueue_struct *wq = cwq->wq;
+ unsigned int cpu;
+
+ if (!(wq->flags & WQ_RESCUER))
+ return false;
+
+ /* mayday mayday mayday */
+ cpu = cwq->gcwq->cpu;
+ /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = 0;
+ if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
+ wake_up_process(wq->rescuer->task);
+ return true;
+}
+
+static void gcwq_mayday_timeout(unsigned long __gcwq)
+{
+ struct global_cwq *gcwq = (void *)__gcwq;
+ struct work_struct *work;
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (need_to_create_worker(gcwq)) {
+ /*
+ * We've been trying to create a new worker but
+ * haven't been successful. We might be hitting an
+ * allocation deadlock. Send distress signals to
+ * rescuers.
+ */
+ list_for_each_entry(work, &gcwq->worklist, entry)
+ send_mayday(work);
+ }
+
+ spin_unlock_irq(&gcwq->lock);
+
+ mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+}
+
+/**
+ * maybe_create_worker - create a new worker if necessary
+ * @gcwq: gcwq to create a new worker for
+ *
+ * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to
+ * have at least one idle worker on return from this function. If
+ * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
+ * sent to all rescuers with works scheduled on @gcwq to resolve
+ * possible allocation deadlock.
+ *
+ * On return, need_to_create_worker() is guaranteed to be false and
+ * may_start_working() true.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations. Called only from
+ * manager.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true
+ * otherwise.
+ */
+static bool maybe_create_worker(struct global_cwq *gcwq)
+{
+ if (!need_to_create_worker(gcwq))
+ return false;
+restart:
+ spin_unlock_irq(&gcwq->lock);
+
+ /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
+ mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+
+ while (true) {
+ struct worker *worker;
+
+ worker = create_worker(gcwq, true);
+ if (worker) {
+ del_timer_sync(&gcwq->mayday_timer);
+ spin_lock_irq(&gcwq->lock);
+ start_worker(worker);
+ BUG_ON(need_to_create_worker(gcwq));
+ return true;
}
- spin_lock_irq(&cwq->lock);
- cwq->current_work = NULL;
+ if (!need_to_create_worker(gcwq))
+ break;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(CREATE_COOLDOWN);
+
+ if (!need_to_create_worker(gcwq))
+ break;
}
- spin_unlock_irq(&cwq->lock);
+
+ del_timer_sync(&gcwq->mayday_timer);
+ spin_lock_irq(&gcwq->lock);
+ if (need_to_create_worker(gcwq))
+ goto restart;
+ return true;
}
-static int worker_thread(void *__cwq)
+/**
+ * maybe_destroy_worker - destroy workers which have been idle for a while
+ * @gcwq: gcwq to destroy workers for
+ *
+ * Destroy @gcwq workers which have been idle for longer than
+ * IDLE_WORKER_TIMEOUT.
+ *
+ * LOCKING:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. Called only from manager.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true
+ * otherwise.
+ */
+static bool maybe_destroy_workers(struct global_cwq *gcwq)
{
- struct cpu_workqueue_struct *cwq = __cwq;
- DEFINE_WAIT(wait);
+ bool ret = false;
- if (cwq->wq->freezeable)
- set_freezable();
+ while (too_many_workers(gcwq)) {
+ struct worker *worker;
+ unsigned long expires;
- for (;;) {
- prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
- if (!freezing(current) &&
- !kthread_should_stop() &&
- list_empty(&cwq->worklist))
- schedule();
- finish_wait(&cwq->more_work, &wait);
+ worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+ expires = worker->last_active + IDLE_WORKER_TIMEOUT;
- try_to_freeze();
+ if (time_before(jiffies, expires)) {
+ mod_timer(&gcwq->idle_timer, expires);
+ break;
+ }
- if (kthread_should_stop())
+ destroy_worker(worker);
+ ret = true;
+ }
+
+ return ret;
+}
+
+/**
+ * manage_workers - manage worker pool
+ * @worker: self
+ *
+ * Assume the manager role and manage gcwq worker pool @worker belongs
+ * to. At any given time, there can be only zero or one manager per
+ * gcwq. The exclusion is handled automatically by this function.
+ *
+ * The caller can safely start processing works on false return. On
+ * true return, it's guaranteed that need_to_create_worker() is false
+ * and may_start_working() is true.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. Does GFP_KERNEL allocations.
+ *
+ * RETURNS:
+ * false if no action was taken and gcwq->lock stayed locked, true if
+ * some action was taken.
+ */
+static bool manage_workers(struct worker *worker)
+{
+ struct global_cwq *gcwq = worker->gcwq;
+ bool ret = false;
+
+ if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+ return ret;
+
+ gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
+ gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+ /*
+ * Destroy and then create so that may_start_working() is true
+ * on return.
+ */
+ ret |= maybe_destroy_workers(gcwq);
+ ret |= maybe_create_worker(gcwq);
+
+ gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+ /*
+ * The trustee might be waiting to take over the manager
+ * position, tell it we're done.
+ */
+ if (unlikely(gcwq->trustee))
+ wake_up_all(&gcwq->trustee_wait);
+
+ return ret;
+}
+
+/**
+ * move_linked_works - move linked works to a list
+ * @work: start of series of works to be scheduled
+ * @head: target list to append @work to
+ * @nextp: out paramter for nested worklist walking
+ *
+ * Schedule linked works starting from @work to @head. Work series to
+ * be scheduled starts at @work and includes any consecutive work with
+ * WORK_STRUCT_LINKED set in its predecessor.
+ *
+ * If @nextp is not NULL, it's updated to point to the next work of
+ * the last scheduled work. This allows move_linked_works() to be
+ * nested inside outer list_for_each_entry_safe().
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+ struct work_struct **nextp)
+{
+ struct work_struct *n;
+
+ /*
+ * Linked worklist will always end before the end of the list,
+ * use NULL for list head.
+ */
+ list_for_each_entry_safe_from(work, n, NULL, entry) {
+ list_move_tail(&work->entry, head);
+ if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
break;
+ }
+
+ /*
+ * If we're already inside safe list traversal and have moved
+ * multiple works to the scheduled queue, the next position
+ * needs to be updated.
+ */
+ if (nextp)
+ *nextp = n;
+}
+
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+ struct work_struct *work = list_first_entry(&cwq->delayed_works,
+ struct work_struct, entry);
+ struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
+
+ move_linked_works(work, pos, NULL);
+ cwq->nr_active++;
+}
- run_workqueue(cwq);
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+{
+ /* ignore uncolored works */
+ if (color == WORK_NO_COLOR)
+ return;
+
+ cwq->nr_in_flight[color]--;
+ cwq->nr_active--;
+
+ if (!list_empty(&cwq->delayed_works)) {
+ /* one down, submit a delayed one */
+ if (cwq->nr_active < cwq->max_active)
+ cwq_activate_first_delayed(cwq);
}
- return 0;
+ /* is flush in progress and are we at the flushing tip? */
+ if (likely(cwq->flush_color != color))
+ return;
+
+ /* are there still in-flight works? */
+ if (cwq->nr_in_flight[color])
+ return;
+
+ /* this cwq is done, clear flush_color */
+ cwq->flush_color = -1;
+
+ /*
+ * If this was the last cwq, wake up the first flusher. It
+ * will handle the rest.
+ */
+ if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+ complete(&cwq->wq->first_flusher->done);
+}
+
+/**
+ * process_one_work - process single work
+ * @worker: self
+ * @work: work to process
+ *
+ * Process @work. This function contains all the logics necessary to
+ * process a single work including synchronization against and
+ * interaction with other workers on the same cpu, queueing and
+ * flushing. As long as context requirement is met, any worker can
+ * call this function to process a work.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ */
+static void process_one_work(struct worker *worker, struct work_struct *work)
+{
+ struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+ struct global_cwq *gcwq = cwq->gcwq;
+ struct hlist_head *bwh = busy_worker_head(gcwq, work);
+ bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
+ work_func_t f = work->func;
+ int work_color;
+ struct worker *collision;
+#ifdef CONFIG_LOCKDEP
+ /*
+ * It is permissible to free the struct work_struct from
+ * inside the function that is called from it, this we need to
+ * take into account for lockdep too. To avoid bogus "held
+ * lock freed" warnings as well as problems when looking into
+ * work->lockdep_map, make a copy and use that here.
+ */
+ struct lockdep_map lockdep_map = work->lockdep_map;
+#endif
+ /*
+ * A single work shouldn't be executed concurrently by
+ * multiple workers on a single cpu. Check whether anyone is
+ * already processing the work. If so, defer the work to the
+ * currently executing one.
+ */
+ collision = __find_worker_executing_work(gcwq, bwh, work);
+ if (unlikely(collision)) {
+ move_linked_works(work, &collision->scheduled, NULL);
+ return;
+ }
+
+ /* claim and process */
+ debug_work_deactivate(work);
+ hlist_add_head(&worker->hentry, bwh);
+ worker->current_work = work;
+ worker->current_cwq = cwq;
+ work_color = get_work_color(work);
+
+ /* record the current cpu number in the work data and dequeue */
+ set_work_cpu(work, gcwq->cpu);
+ list_del_init(&work->entry);
+
+ /*
+ * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
+ * wake up another worker; otherwise, clear HIGHPRI_PENDING.
+ */
+ if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
+ struct work_struct *nwork = list_first_entry(&gcwq->worklist,
+ struct work_struct, entry);
+
+ if (!list_empty(&gcwq->worklist) &&
+ get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
+ wake_up_worker(gcwq);
+ else
+ gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
+ }
+
+ /*
+ * CPU intensive works don't participate in concurrency
+ * management. They're the scheduler's responsibility.
+ */
+ if (unlikely(cpu_intensive))
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+
+ spin_unlock_irq(&gcwq->lock);
+
+ work_clear_pending(work);
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire(&lockdep_map);
+ f(work);
+ lock_map_release(&lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
+
+ if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+ printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+ "%s/0x%08x/%d\n",
+ current->comm, preempt_count(), task_pid_nr(current));
+ printk(KERN_ERR " last function: ");
+ print_symbol("%s\n", (unsigned long)f);
+ debug_show_held_locks(current);
+ dump_stack();
+ }
+
+ spin_lock_irq(&gcwq->lock);
+
+ /* clear cpu intensive status */
+ if (unlikely(cpu_intensive))
+ worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+
+ /* we're done with it, release */
+ hlist_del_init(&worker->hentry);
+ worker->current_work = NULL;
+ worker->current_cwq = NULL;
+ cwq_dec_nr_in_flight(cwq, work_color);
+}
+
+/**
+ * process_scheduled_works - process scheduled works
+ * @worker: self
+ *
+ * Process all scheduled works. Please note that the scheduled list
+ * may change while processing a work, so this function repeatedly
+ * fetches a work from the top and executes it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.
+ */
+static void process_scheduled_works(struct worker *worker)
+{
+ while (!list_empty(&worker->scheduled)) {
+ struct work_struct *work = list_first_entry(&worker->scheduled,
+ struct work_struct, entry);
+ process_one_work(worker, work);
+ }
+}
+
+/**
+ * worker_thread - the worker thread function
+ * @__worker: self
+ *
+ * The gcwq worker thread function. There's a single dynamic pool of
+ * these per each cpu. These workers process all works regardless of
+ * their specific target workqueue. The only exception is works which
+ * belong to workqueues with a rescuer which will be explained in
+ * rescuer_thread().
+ */
+static int worker_thread(void *__worker)
+{
+ struct worker *worker = __worker;
+ struct global_cwq *gcwq = worker->gcwq;
+
+ /* tell the scheduler that this is a workqueue worker */
+ worker->task->flags |= PF_WQ_WORKER;
+woke_up:
+ spin_lock_irq(&gcwq->lock);
+
+ /* DIE can be set only while we're idle, checking here is enough */
+ if (worker->flags & WORKER_DIE) {
+ spin_unlock_irq(&gcwq->lock);
+ worker->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
+ worker_leave_idle(worker);
+recheck:
+ /* no more worker necessary? */
+ if (!need_more_worker(gcwq))
+ goto sleep;
+
+ /* do we need to manage? */
+ if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+ goto recheck;
+
+ /*
+ * ->scheduled list can only be filled while a worker is
+ * preparing to process a work or actually processing it.
+ * Make sure nobody diddled with it while I was sleeping.
+ */
+ BUG_ON(!list_empty(&worker->scheduled));
+
+ /*
+ * When control reaches this point, we're guaranteed to have
+ * at least one idle worker or that someone else has already
+ * assumed the manager role.
+ */
+ worker_clr_flags(worker, WORKER_PREP);
+
+ do {
+ struct work_struct *work =
+ list_first_entry(&gcwq->worklist,
+ struct work_struct, entry);
+
+ if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
+ /* optimization path, not strictly necessary */
+ process_one_work(worker, work);
+ if (unlikely(!list_empty(&worker->scheduled)))
+ process_scheduled_works(worker);
+ } else {
+ move_linked_works(work, &worker->scheduled, NULL);
+ process_scheduled_works(worker);
+ }
+ } while (keep_working(gcwq));
+
+ worker_set_flags(worker, WORKER_PREP, false);
+sleep:
+ if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+ goto recheck;
+
+ /*
+ * gcwq->lock is held and there's no work to process and no
+ * need to manage, sleep. Workers are woken up only while
+ * holding gcwq->lock or from local cpu, so setting the
+ * current state before releasing gcwq->lock is enough to
+ * prevent losing any event.
+ */
+ worker_enter_idle(worker);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&gcwq->lock);
+ schedule();
+ goto woke_up;
+}
+
+/**
+ * rescuer_thread - the rescuer thread function
+ * @__wq: the associated workqueue
+ *
+ * Workqueue rescuer thread function. There's one rescuer for each
+ * workqueue which has WQ_RESCUER set.
+ *
+ * Regular work processing on a gcwq may block trying to create a new
+ * worker which uses GFP_KERNEL allocation which has slight chance of
+ * developing into deadlock if some works currently on the same queue
+ * need to be processed to satisfy the GFP_KERNEL allocation. This is
+ * the problem rescuer solves.
+ *
+ * When such condition is possible, the gcwq summons rescuers of all
+ * workqueues which have works queued on the gcwq and let them process
+ * those works so that forward progress can be guaranteed.
+ *
+ * This should happen rarely.
+ */
+static int rescuer_thread(void *__wq)
+{
+ struct workqueue_struct *wq = __wq;
+ struct worker *rescuer = wq->rescuer;
+ struct list_head *scheduled = &rescuer->scheduled;
+ bool is_unbound = wq->flags & WQ_UNBOUND;
+ unsigned int cpu;
+
+ set_user_nice(current, RESCUER_NICE_LEVEL);
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (kthread_should_stop())
+ return 0;
+
+ /*
+ * See whether any cpu is asking for help. Unbounded
+ * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
+ */
+ for_each_mayday_cpu(cpu, wq->mayday_mask) {
+ unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
+ struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
+ struct global_cwq *gcwq = cwq->gcwq;
+ struct work_struct *work, *n;
+
+ __set_current_state(TASK_RUNNING);
+ mayday_clear_cpu(cpu, wq->mayday_mask);
+
+ /* migrate to the target cpu if possible */
+ rescuer->gcwq = gcwq;
+ worker_maybe_bind_and_lock(rescuer);
+
+ /*
+ * Slurp in all works issued via this workqueue and
+ * process'em.
+ */
+ BUG_ON(!list_empty(&rescuer->scheduled));
+ list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+ if (get_work_cwq(work) == cwq)
+ move_linked_works(work, scheduled, &n);
+
+ process_scheduled_works(rescuer);
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ schedule();
+ goto repeat;
}
struct wq_barrier {
@@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work)
complete(&barr->done);
}
+/**
+ * insert_wq_barrier - insert a barrier work
+ * @cwq: cwq to insert barrier into
+ * @barr: wq_barrier to insert
+ * @target: target work to attach @barr to
+ * @worker: worker currently executing @target, NULL if @target is not executing
+ *
+ * @barr is linked to @target such that @barr is completed only after
+ * @target finishes execution. Please note that the ordering
+ * guarantee is observed only with respect to @target and on the local
+ * cpu.
+ *
+ * Currently, a queued barrier can't be canceled. This is because
+ * try_to_grab_pending() can't determine whether the work to be
+ * grabbed is at the head of the queue and thus can't clear LINKED
+ * flag of the previous work while there must be a valid next work
+ * after a work with LINKED flag set.
+ *
+ * Note that when @worker is non-NULL, @target may be modified
+ * underneath us, so we can't reliably determine cwq from @target.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
- struct wq_barrier *barr, struct list_head *head)
+ struct wq_barrier *barr,
+ struct work_struct *target, struct worker *worker)
{
+ struct list_head *head;
+ unsigned int linked = 0;
+
/*
- * debugobject calls are safe here even with cwq->lock locked
+ * debugobject calls are safe here even with gcwq->lock locked
* as we know for sure that this will not trigger any of the
* checks and call back into the fixup functions where we
* might deadlock.
*/
INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
- __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
-
+ __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ /*
+ * If @target is currently being executed, schedule the
+ * barrier to the worker; otherwise, put it after @target.
+ */
+ if (worker)
+ head = worker->scheduled.next;
+ else {
+ unsigned long *bits = work_data_bits(target);
+
+ head = target->entry.next;
+ /* there can already be other linked works, inherit and set */
+ linked = *bits & WORK_STRUCT_LINKED;
+ __set_bit(WORK_STRUCT_LINKED_BIT, bits);
+ }
+
debug_work_activate(&barr->work);
- insert_work(cwq, &barr->work, head);
+ insert_work(cwq, &barr->work, head,
+ work_color_to_flags(WORK_NO_COLOR) | linked);
}
-static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
+/**
+ * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * @wq: workqueue being flushed
+ * @flush_color: new flush color, < 0 for no-op
+ * @work_color: new work color, < 0 for no-op
+ *
+ * Prepare cwqs for workqueue flushing.
+ *
+ * If @flush_color is non-negative, flush_color on all cwqs should be
+ * -1. If no cwq has in-flight commands at the specified color, all
+ * cwq->flush_color's stay at -1 and %false is returned. If any cwq
+ * has in flight commands, its cwq->flush_color is set to
+ * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * wakeup logic is armed and %true is returned.
+ *
+ * The caller should have initialized @wq->first_flusher prior to
+ * calling this function with non-negative @flush_color. If
+ * @flush_color is negative, no flush color update is done and %false
+ * is returned.
+ *
+ * If @work_color is non-negative, all cwqs should have the same
+ * work_color which is previous to @work_color and all will be
+ * advanced to @work_color.
+ *
+ * CONTEXT:
+ * mutex_lock(wq->flush_mutex).
+ *
+ * RETURNS:
+ * %true if @flush_color >= 0 and there's something to flush. %false
+ * otherwise.
+ */
+static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+ int flush_color, int work_color)
{
- int active = 0;
- struct wq_barrier barr;
+ bool wait = false;
+ unsigned int cpu;
- WARN_ON(cwq->thread == current);
-
- spin_lock_irq(&cwq->lock);
- if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
- insert_wq_barrier(cwq, &barr, &cwq->worklist);
- active = 1;
+ if (flush_color >= 0) {
+ BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
+ atomic_set(&wq->nr_cwqs_to_flush, 1);
}
- spin_unlock_irq(&cwq->lock);
- if (active) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct global_cwq *gcwq = cwq->gcwq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (flush_color >= 0) {
+ BUG_ON(cwq->flush_color != -1);
+
+ if (cwq->nr_in_flight[flush_color]) {
+ cwq->flush_color = flush_color;
+ atomic_inc(&wq->nr_cwqs_to_flush);
+ wait = true;
+ }
+ }
+
+ if (work_color >= 0) {
+ BUG_ON(work_color != work_next_color(cwq->work_color));
+ cwq->work_color = work_color;
+ }
+
+ spin_unlock_irq(&gcwq->lock);
}
- return active;
+ if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+ complete(&wq->first_flusher->done);
+
+ return wait;
}
/**
@@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
*
* We sleep until all works which were queued on entry have been handled,
* but we are not livelocked by new incoming ones.
- *
- * This function used to run the workqueues itself. Now we just wait for the
- * helper threads to do it.
*/
void flush_workqueue(struct workqueue_struct *wq)
{
- const struct cpumask *cpu_map = wq_cpu_map(wq);
- int cpu;
+ struct wq_flusher this_flusher = {
+ .list = LIST_HEAD_INIT(this_flusher.list),
+ .flush_color = -1,
+ .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+ };
+ int next_color;
- might_sleep();
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
- for_each_cpu(cpu, cpu_map)
- flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
+
+ mutex_lock(&wq->flush_mutex);
+
+ /*
+ * Start-to-wait phase
+ */
+ next_color = work_next_color(wq->work_color);
+
+ if (next_color != wq->flush_color) {
+ /*
+ * Color space is not full. The current work_color
+ * becomes our flush_color and work_color is advanced
+ * by one.
+ */
+ BUG_ON(!list_empty(&wq->flusher_overflow));
+ this_flusher.flush_color = wq->work_color;
+ wq->work_color = next_color;
+
+ if (!wq->first_flusher) {
+ /* no flush in progress, become the first flusher */
+ BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+ wq->first_flusher = &this_flusher;
+
+ if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+ wq->work_color)) {
+ /* nothing to flush, done */
+ wq->flush_color = next_color;
+ wq->first_flusher = NULL;
+ goto out_unlock;
+ }
+ } else {
+ /* wait in queue */
+ BUG_ON(wq->flush_color == this_flusher.flush_color);
+ list_add_tail(&this_flusher.list, &wq->flusher_queue);
+ flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ }
+ } else {
+ /*
+ * Oops, color space is full, wait on overflow queue.
+ * The next flush completion will assign us
+ * flush_color and transfer to flusher_queue.
+ */
+ list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+ }
+
+ mutex_unlock(&wq->flush_mutex);
+
+ wait_for_completion(&this_flusher.done);
+
+ /*
+ * Wake-up-and-cascade phase
+ *
+ * First flushers are responsible for cascading flushes and
+ * handling overflow. Non-first flushers can simply return.
+ */
+ if (wq->first_flusher != &this_flusher)
+ return;
+
+ mutex_lock(&wq->flush_mutex);
+
+ /* we might have raced, check again with mutex held */
+ if (wq->first_flusher != &this_flusher)
+ goto out_unlock;
+
+ wq->first_flusher = NULL;
+
+ BUG_ON(!list_empty(&this_flusher.list));
+ BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+ while (true) {
+ struct wq_flusher *next, *tmp;
+
+ /* complete all the flushers sharing the current flush color */
+ list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
+ if (next->flush_color != wq->flush_color)
+ break;
+ list_del_init(&next->list);
+ complete(&next->done);
+ }
+
+ BUG_ON(!list_empty(&wq->flusher_overflow) &&
+ wq->flush_color != work_next_color(wq->work_color));
+
+ /* this flush_color is finished, advance by one */
+ wq->flush_color = work_next_color(wq->flush_color);
+
+ /* one color has been freed, handle overflow queue */
+ if (!list_empty(&wq->flusher_overflow)) {
+ /*
+ * Assign the same color to all overflowed
+ * flushers, advance work_color and append to
+ * flusher_queue. This is the start-to-wait
+ * phase for these overflowed flushers.
+ */
+ list_for_each_entry(tmp, &wq->flusher_overflow, list)
+ tmp->flush_color = wq->work_color;
+
+ wq->work_color = work_next_color(wq->work_color);
+
+ list_splice_tail_init(&wq->flusher_overflow,
+ &wq->flusher_queue);
+ flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+ }
+
+ if (list_empty(&wq->flusher_queue)) {
+ BUG_ON(wq->flush_color != wq->work_color);
+ break;
+ }
+
+ /*
+ * Need to flush more colors. Make the next flusher
+ * the new first flusher and arm cwqs.
+ */
+ BUG_ON(wq->flush_color == wq->work_color);
+ BUG_ON(wq->flush_color != next->flush_color);
+
+ list_del_init(&next->list);
+ wq->first_flusher = next;
+
+ if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+ break;
+
+ /*
+ * Meh... this color is already done, clear first
+ * flusher and repeat cascading.
+ */
+ wq->first_flusher = NULL;
+ }
+
+out_unlock:
+ mutex_unlock(&wq->flush_mutex);
}
EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
*/
int flush_work(struct work_struct *work)
{
+ struct worker *worker = NULL;
+ struct global_cwq *gcwq;
struct cpu_workqueue_struct *cwq;
- struct list_head *prev;
struct wq_barrier barr;
might_sleep();
- cwq = get_wq_data(work);
- if (!cwq)
+ gcwq = get_work_gcwq(work);
+ if (!gcwq)
return 0;
- lock_map_acquire(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
-
- prev = NULL;
- spin_lock_irq(&cwq->lock);
+ spin_lock_irq(&gcwq->lock);
if (!list_empty(&work->entry)) {
/*
* See the comment near try_to_grab_pending()->smp_rmb().
- * If it was re-queued under us we are not going to wait.
+ * If it was re-queued to a different gcwq under us, we
+ * are not going to wait.
*/
smp_rmb();
- if (unlikely(cwq != get_wq_data(work)))
- goto out;
- prev = &work->entry;
+ cwq = get_work_cwq(work);
+ if (unlikely(!cwq || gcwq != cwq->gcwq))
+ goto already_gone;
} else {
- if (cwq->current_work != work)
- goto out;
- prev = &cwq->worklist;
+ worker = find_worker_executing_work(gcwq, work);
+ if (!worker)
+ goto already_gone;
+ cwq = worker->current_cwq;
}
- insert_wq_barrier(cwq, &barr, prev->next);
-out:
- spin_unlock_irq(&cwq->lock);
- if (!prev)
- return 0;
+
+ insert_wq_barrier(cwq, &barr, work, worker);
+ spin_unlock_irq(&gcwq->lock);
+
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
return 1;
+already_gone:
+ spin_unlock_irq(&gcwq->lock);
+ return 0;
}
EXPORT_SYMBOL_GPL(flush_work);
@@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work);
*/
static int try_to_grab_pending(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq;
+ struct global_cwq *gcwq;
int ret = -1;
- if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
return 0;
/*
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
*/
-
- cwq = get_wq_data(work);
- if (!cwq)
+ gcwq = get_work_gcwq(work);
+ if (!gcwq)
return ret;
- spin_lock_irq(&cwq->lock);
+ spin_lock_irq(&gcwq->lock);
if (!list_empty(&work->entry)) {
/*
- * This work is queued, but perhaps we locked the wrong cwq.
+ * This work is queued, but perhaps we locked the wrong gcwq.
* In that case we must see the new value after rmb(), see
* insert_work()->wmb().
*/
smp_rmb();
- if (cwq == get_wq_data(work)) {
+ if (gcwq == get_work_gcwq(work)) {
debug_work_deactivate(work);
list_del_init(&work->entry);
+ cwq_dec_nr_in_flight(get_work_cwq(work),
+ get_work_color(work));
ret = 1;
}
}
- spin_unlock_irq(&cwq->lock);
+ spin_unlock_irq(&gcwq->lock);
return ret;
}
-static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
- struct work_struct *work)
+static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
{
struct wq_barrier barr;
- int running = 0;
+ struct worker *worker;
- spin_lock_irq(&cwq->lock);
- if (unlikely(cwq->current_work == work)) {
- insert_wq_barrier(cwq, &barr, cwq->worklist.next);
- running = 1;
- }
- spin_unlock_irq(&cwq->lock);
+ spin_lock_irq(&gcwq->lock);
+
+ worker = find_worker_executing_work(gcwq, work);
+ if (unlikely(worker))
+ insert_wq_barrier(worker->current_cwq, &barr, work, worker);
- if (unlikely(running)) {
+ spin_unlock_irq(&gcwq->lock);
+
+ if (unlikely(worker)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
}
@@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
static void wait_on_work(struct work_struct *work)
{
- struct cpu_workqueue_struct *cwq;
- struct workqueue_struct *wq;
- const struct cpumask *cpu_map;
int cpu;
might_sleep();
@@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work)
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
- cwq = get_wq_data(work);
- if (!cwq)
- return;
-
- wq = cwq->wq;
- cpu_map = wq_cpu_map(wq);
-
- for_each_cpu(cpu, cpu_map)
- wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
+ for_each_gcwq_cpu(cpu)
+ wait_on_cpu_work(get_gcwq(cpu), work);
}
static int __cancel_work_timer(struct work_struct *work,
@@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work,
wait_on_work(work);
} while (unlikely(ret < 0));
- clear_wq_data(work);
+ clear_work_data(work);
return ret;
}
@@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork)
}
EXPORT_SYMBOL(cancel_delayed_work_sync);
-static struct workqueue_struct *keventd_wq __read_mostly;
-
/**
* schedule_work - put work task in global workqueue
* @work: job to be done
@@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
*/
int schedule_work(struct work_struct *work)
{
- return queue_work(keventd_wq, work);
+ return queue_work(system_wq, work);
}
EXPORT_SYMBOL(schedule_work);
@@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work);
*/
int schedule_work_on(int cpu, struct work_struct *work)
{
- return queue_work_on(cpu, keventd_wq, work);
+ return queue_work_on(cpu, system_wq, work);
}
EXPORT_SYMBOL(schedule_work_on);
@@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on);
int schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
- return queue_delayed_work(keventd_wq, dwork, delay);
+ return queue_delayed_work(system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work);
@@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work);
void flush_delayed_work(struct delayed_work *dwork)
{
if (del_timer_sync(&dwork->timer)) {
- struct cpu_workqueue_struct *cwq;
- cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
- __queue_work(cwq, &dwork->work);
+ __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
+ &dwork->work);
put_cpu();
}
flush_work(&dwork->work);
@@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work);
int schedule_delayed_work_on(int cpu,
struct delayed_work *dwork, unsigned long delay)
{
- return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
+ return queue_delayed_work_on(cpu, system_wq, dwork, delay);
}
EXPORT_SYMBOL(schedule_delayed_work_on);
@@ -820,8 +2568,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
int schedule_on_each_cpu(work_func_t func)
{
int cpu;
- int orig = -1;
- struct work_struct *works;
+ struct work_struct __percpu *works;
works = alloc_percpu(struct work_struct);
if (!works)
@@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func)
get_online_cpus();
- /*
- * When running in keventd don't schedule a work item on
- * itself. Can just call directly because the work queue is
- * already bound. This also is faster.
- */
- if (current_is_keventd())
- orig = raw_smp_processor_id();
-
for_each_online_cpu(cpu) {
struct work_struct *work = per_cpu_ptr(works, cpu);
INIT_WORK(work, func);
- if (cpu != orig)
- schedule_work_on(cpu, work);
+ schedule_work_on(cpu, work);
}
- if (orig >= 0)
- func(per_cpu_ptr(works, orig));
for_each_online_cpu(cpu)
flush_work(per_cpu_ptr(works, cpu));
@@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func)
*/
void flush_scheduled_work(void)
{
- flush_workqueue(keventd_wq);
+ flush_workqueue(system_wq);
}
EXPORT_SYMBOL(flush_scheduled_work);
@@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
int keventd_up(void)
{
- return keventd_wq != NULL;
+ return system_wq != NULL;
}
-int current_is_keventd(void)
+static int alloc_cwqs(struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq;
- int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
- int ret = 0;
-
- BUG_ON(!keventd_wq);
+ /*
+ * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
+ * Make sure that the alignment isn't lower than that of
+ * unsigned long long.
+ */
+ const size_t size = sizeof(struct cpu_workqueue_struct);
+ const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
+ __alignof__(unsigned long long));
+#ifdef CONFIG_SMP
+ bool percpu = !(wq->flags & WQ_UNBOUND);
+#else
+ bool percpu = false;
+#endif
- cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
- if (current == cwq->thread)
- ret = 1;
+ if (percpu)
+ wq->cpu_wq.pcpu = __alloc_percpu(size, align);
+ else {
+ void *ptr;
- return ret;
+ /*
+ * Allocate enough room to align cwq and put an extra
+ * pointer at the end pointing back to the originally
+ * allocated pointer which will be used for free.
+ */
+ ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
+ if (ptr) {
+ wq->cpu_wq.single = PTR_ALIGN(ptr, align);
+ *(void **)(wq->cpu_wq.single + 1) = ptr;
+ }
+ }
+ /* just in case, make sure it's actually aligned */
+ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
+ return wq->cpu_wq.v ? 0 : -ENOMEM;
}
-static struct cpu_workqueue_struct *
-init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
+static void free_cwqs(struct workqueue_struct *wq)
{
- struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-
- cwq->wq = wq;
- spin_lock_init(&cwq->lock);
- INIT_LIST_HEAD(&cwq->worklist);
- init_waitqueue_head(&cwq->more_work);
+#ifdef CONFIG_SMP
+ bool percpu = !(wq->flags & WQ_UNBOUND);
+#else
+ bool percpu = false;
+#endif
- return cwq;
+ if (percpu)
+ free_percpu(wq->cpu_wq.pcpu);
+ else if (wq->cpu_wq.single) {
+ /* the pointer to free is stored right after the cwq */
+ kfree(*(void **)(wq->cpu_wq.single + 1));
+ }
}
-static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+static int wq_clamp_max_active(int max_active, unsigned int flags,
+ const char *name)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- struct workqueue_struct *wq = cwq->wq;
- const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
- struct task_struct *p;
+ int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
- p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
- /*
- * Nobody can add the work_struct to this cwq,
- * if (caller is __create_workqueue)
- * nobody should see this wq
- * else // caller is CPU_UP_PREPARE
- * cpu is not on cpu_online_map
- * so we can abort safely.
- */
- if (IS_ERR(p))
- return PTR_ERR(p);
- if (cwq->wq->rt)
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- cwq->thread = p;
+ if (max_active < 1 || max_active > lim)
+ printk(KERN_WARNING "workqueue: max_active %d requested for %s "
+ "is out of range, clamping between %d and %d\n",
+ max_active, name, 1, lim);
- trace_workqueue_creation(cwq->thread, cpu);
-
- return 0;
+ return clamp_val(max_active, 1, lim);
}
-static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
+struct workqueue_struct *__alloc_workqueue_key(const char *name,
+ unsigned int flags,
+ int max_active,
+ struct lock_class_key *key,
+ const char *lock_name)
{
- struct task_struct *p = cwq->thread;
+ struct workqueue_struct *wq;
+ unsigned int cpu;
- if (p != NULL) {
- if (cpu >= 0)
- kthread_bind(p, cpu);
- wake_up_process(p);
- }
-}
+ /*
+ * Unbound workqueues aren't concurrency managed and should be
+ * dispatched to workers immediately.
+ */
+ if (flags & WQ_UNBOUND)
+ flags |= WQ_HIGHPRI;
-struct workqueue_struct *__create_workqueue_key(const char *name,
- int singlethread,
- int freezeable,
- int rt,
- struct lock_class_key *key,
- const char *lock_name)
-{
- struct workqueue_struct *wq;
- struct cpu_workqueue_struct *cwq;
- int err = 0, cpu;
+ max_active = max_active ?: WQ_DFL_ACTIVE;
+ max_active = wq_clamp_max_active(max_active, flags, name);
wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
- return NULL;
+ goto err;
- wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
- if (!wq->cpu_wq) {
- kfree(wq);
- return NULL;
- }
+ wq->flags = flags;
+ wq->saved_max_active = max_active;
+ mutex_init(&wq->flush_mutex);
+ atomic_set(&wq->nr_cwqs_to_flush, 0);
+ INIT_LIST_HEAD(&wq->flusher_queue);
+ INIT_LIST_HEAD(&wq->flusher_overflow);
wq->name = name;
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
- wq->singlethread = singlethread;
- wq->freezeable = freezeable;
- wq->rt = rt;
INIT_LIST_HEAD(&wq->list);
- if (singlethread) {
- cwq = init_cpu_workqueue(wq, singlethread_cpu);
- err = create_workqueue_thread(cwq, singlethread_cpu);
- start_workqueue_thread(cwq, -1);
- } else {
- cpu_maps_update_begin();
- /*
- * We must place this wq on list even if the code below fails.
- * cpu_down(cpu) can remove cpu from cpu_populated_map before
- * destroy_workqueue() takes the lock, in that case we leak
- * cwq[cpu]->thread.
- */
- spin_lock(&workqueue_lock);
- list_add(&wq->list, &workqueues);
- spin_unlock(&workqueue_lock);
- /*
- * We must initialize cwqs for each possible cpu even if we
- * are going to call destroy_workqueue() finally. Otherwise
- * cpu_up() can hit the uninitialized cwq once we drop the
- * lock.
- */
- for_each_possible_cpu(cpu) {
- cwq = init_cpu_workqueue(wq, cpu);
- if (err || !cpu_online(cpu))
- continue;
- err = create_workqueue_thread(cwq, cpu);
- start_workqueue_thread(cwq, cpu);
- }
- cpu_maps_update_done();
+ if (alloc_cwqs(wq) < 0)
+ goto err;
+
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ struct global_cwq *gcwq = get_gcwq(cpu);
+
+ BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
+ cwq->gcwq = gcwq;
+ cwq->wq = wq;
+ cwq->flush_color = -1;
+ cwq->max_active = max_active;
+ INIT_LIST_HEAD(&cwq->delayed_works);
}
- if (err) {
- destroy_workqueue(wq);
- wq = NULL;
+ if (flags & WQ_RESCUER) {
+ struct worker *rescuer;
+
+ if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
+ goto err;
+
+ wq->rescuer = rescuer = alloc_worker();
+ if (!rescuer)
+ goto err;
+
+ rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+ if (IS_ERR(rescuer->task))
+ goto err;
+
+ wq->rescuer = rescuer;
+ rescuer->task->flags |= PF_THREAD_BOUND;
+ wake_up_process(rescuer->task);
}
- return wq;
-}
-EXPORT_SYMBOL_GPL(__create_workqueue_key);
-static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
-{
/*
- * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
- * cpu_add_remove_lock protects cwq->thread.
+ * workqueue_lock protects global freeze state and workqueues
+ * list. Grab it, set max_active accordingly and add the new
+ * workqueue to workqueues list.
*/
- if (cwq->thread == NULL)
- return;
+ spin_lock(&workqueue_lock);
- lock_map_acquire(&cwq->wq->lockdep_map);
- lock_map_release(&cwq->wq->lockdep_map);
+ if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+ for_each_cwq_cpu(cpu, wq)
+ get_cwq(cpu, wq)->max_active = 0;
- flush_cpu_workqueue(cwq);
- /*
- * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
- * a concurrent flush_workqueue() can insert a barrier after us.
- * However, in that case run_workqueue() won't return and check
- * kthread_should_stop() until it flushes all work_struct's.
- * When ->worklist becomes empty it is safe to exit because no
- * more work_structs can be queued on this cwq: flush_workqueue
- * checks list_empty(), and a "normal" queue_work() can't use
- * a dead CPU.
- */
- trace_workqueue_destruction(cwq->thread);
- kthread_stop(cwq->thread);
- cwq->thread = NULL;
+ list_add(&wq->list, &workqueues);
+
+ spin_unlock(&workqueue_lock);
+
+ return wq;
+err:
+ if (wq) {
+ free_cwqs(wq);
+ free_mayday_mask(wq->mayday_mask);
+ kfree(wq->rescuer);
+ kfree(wq);
+ }
+ return NULL;
}
+EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
/**
* destroy_workqueue - safely terminate a workqueue
@@ -1086,72 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
- const struct cpumask *cpu_map = wq_cpu_map(wq);
- int cpu;
+ unsigned int cpu;
- cpu_maps_update_begin();
+ flush_workqueue(wq);
+
+ /*
+ * wq list is used to freeze wq, remove from list after
+ * flushing is complete in case freeze races us.
+ */
spin_lock(&workqueue_lock);
list_del(&wq->list);
spin_unlock(&workqueue_lock);
- for_each_cpu(cpu, cpu_map)
- cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
- cpu_maps_update_done();
+ /* sanity check */
+ for_each_cwq_cpu(cpu, wq) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+ int i;
+
+ for (i = 0; i < WORK_NR_COLORS; i++)
+ BUG_ON(cwq->nr_in_flight[i]);
+ BUG_ON(cwq->nr_active);
+ BUG_ON(!list_empty(&cwq->delayed_works));
+ }
+
+ if (wq->flags & WQ_RESCUER) {
+ kthread_stop(wq->rescuer->task);
+ free_mayday_mask(wq->mayday_mask);
+ }
- free_percpu(wq->cpu_wq);
+ free_cwqs(wq);
kfree(wq);
}
EXPORT_SYMBOL_GPL(destroy_workqueue);
+/**
+ * workqueue_set_max_active - adjust max_active of a workqueue
+ * @wq: target workqueue
+ * @max_active: new max_active value.
+ *
+ * Set max_active of @wq to @max_active.
+ *
+ * CONTEXT:
+ * Don't call from IRQ context.
+ */
+void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
+{
+ unsigned int cpu;
+
+ max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
+
+ spin_lock(&workqueue_lock);
+
+ wq->saved_max_active = max_active;
+
+ for_each_cwq_cpu(cpu, wq) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+
+ spin_lock_irq(&gcwq->lock);
+
+ if (!(wq->flags & WQ_FREEZEABLE) ||
+ !(gcwq->flags & GCWQ_FREEZING))
+ get_cwq(gcwq->cpu, wq)->max_active = max_active;
+
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ spin_unlock(&workqueue_lock);
+}
+EXPORT_SYMBOL_GPL(workqueue_set_max_active);
+
+/**
+ * workqueue_congested - test whether a workqueue is congested
+ * @cpu: CPU in question
+ * @wq: target workqueue
+ *
+ * Test whether @wq's cpu workqueue for @cpu is congested. There is
+ * no synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ *
+ * RETURNS:
+ * %true if congested, %false otherwise.
+ */
+bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
+{
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ return !list_empty(&cwq->delayed_works);
+}
+EXPORT_SYMBOL_GPL(workqueue_congested);
+
+/**
+ * work_cpu - return the last known associated cpu for @work
+ * @work: the work of interest
+ *
+ * RETURNS:
+ * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
+ */
+unsigned int work_cpu(struct work_struct *work)
+{
+ struct global_cwq *gcwq = get_work_gcwq(work);
+
+ return gcwq ? gcwq->cpu : WORK_CPU_NONE;
+}
+EXPORT_SYMBOL_GPL(work_cpu);
+
+/**
+ * work_busy - test whether a work is currently pending or running
+ * @work: the work to be tested
+ *
+ * Test whether @work is currently pending or running. There is no
+ * synchronization around this function and the test result is
+ * unreliable and only useful as advisory hints or for debugging.
+ * Especially for reentrant wqs, the pending state might hide the
+ * running state.
+ *
+ * RETURNS:
+ * OR'd bitmask of WORK_BUSY_* bits.
+ */
+unsigned int work_busy(struct work_struct *work)
+{
+ struct global_cwq *gcwq = get_work_gcwq(work);
+ unsigned long flags;
+ unsigned int ret = 0;
+
+ if (!gcwq)
+ return false;
+
+ spin_lock_irqsave(&gcwq->lock, flags);
+
+ if (work_pending(work))
+ ret |= WORK_BUSY_PENDING;
+ if (find_worker_executing_work(gcwq, work))
+ ret |= WORK_BUSY_RUNNING;
+
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(work_busy);
+
+/*
+ * CPU hotplug.
+ *
+ * There are two challenges in supporting CPU hotplug. Firstly, there
+ * are a lot of assumptions on strong associations among work, cwq and
+ * gcwq which make migrating pending and scheduled works very
+ * difficult to implement without impacting hot paths. Secondly,
+ * gcwqs serve mix of short, long and very long running works making
+ * blocked draining impractical.
+ *
+ * This is solved by allowing a gcwq to be detached from CPU, running
+ * it with unbound (rogue) workers and allowing it to be reattached
+ * later if the cpu comes back online. A separate thread is created
+ * to govern a gcwq in such state and is called the trustee of the
+ * gcwq.
+ *
+ * Trustee states and their descriptions.
+ *
+ * START Command state used on startup. On CPU_DOWN_PREPARE, a
+ * new trustee is started with this state.
+ *
+ * IN_CHARGE Once started, trustee will enter this state after
+ * assuming the manager role and making all existing
+ * workers rogue. DOWN_PREPARE waits for trustee to
+ * enter this state. After reaching IN_CHARGE, trustee
+ * tries to execute the pending worklist until it's empty
+ * and the state is set to BUTCHER, or the state is set
+ * to RELEASE.
+ *
+ * BUTCHER Command state which is set by the cpu callback after
+ * the cpu has went down. Once this state is set trustee
+ * knows that there will be no new works on the worklist
+ * and once the worklist is empty it can proceed to
+ * killing idle workers.
+ *
+ * RELEASE Command state which is set by the cpu callback if the
+ * cpu down has been canceled or it has come online
+ * again. After recognizing this state, trustee stops
+ * trying to drain or butcher and clears ROGUE, rebinds
+ * all remaining workers back to the cpu and releases
+ * manager role.
+ *
+ * DONE Trustee will enter this state after BUTCHER or RELEASE
+ * is complete.
+ *
+ * trustee CPU draining
+ * took over down complete
+ * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
+ * | | ^
+ * | CPU is back online v return workers |
+ * ----------------> RELEASE --------------
+ */
+
+/**
+ * trustee_wait_event_timeout - timed event wait for trustee
+ * @cond: condition to wait for
+ * @timeout: timeout in jiffies
+ *
+ * wait_event_timeout() for trustee to use. Handles locking and
+ * checks for RELEASE request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. To be used by trustee.
+ *
+ * RETURNS:
+ * Positive indicating left time if @cond is satisfied, 0 if timed
+ * out, -1 if canceled.
+ */
+#define trustee_wait_event_timeout(cond, timeout) ({ \
+ long __ret = (timeout); \
+ while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
+ __ret) { \
+ spin_unlock_irq(&gcwq->lock); \
+ __wait_event_timeout(gcwq->trustee_wait, (cond) || \
+ (gcwq->trustee_state == TRUSTEE_RELEASE), \
+ __ret); \
+ spin_lock_irq(&gcwq->lock); \
+ } \
+ gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
+})
+
+/**
+ * trustee_wait_event - event wait for trustee
+ * @cond: condition to wait for
+ *
+ * wait_event() for trustee to use. Automatically handles locking and
+ * checks for CANCEL request.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. To be used by trustee.
+ *
+ * RETURNS:
+ * 0 if @cond is satisfied, -1 if canceled.
+ */
+#define trustee_wait_event(cond) ({ \
+ long __ret1; \
+ __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
+ __ret1 < 0 ? -1 : 0; \
+})
+
+static int __cpuinit trustee_thread(void *__gcwq)
+{
+ struct global_cwq *gcwq = __gcwq;
+ struct worker *worker;
+ struct work_struct *work;
+ struct hlist_node *pos;
+ long rc;
+ int i;
+
+ BUG_ON(gcwq->cpu != smp_processor_id());
+
+ spin_lock_irq(&gcwq->lock);
+ /*
+ * Claim the manager position and make all workers rogue.
+ * Trustee must be bound to the target cpu and can't be
+ * cancelled.
+ */
+ BUG_ON(gcwq->cpu != smp_processor_id());
+ rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
+ BUG_ON(rc < 0);
+
+ gcwq->flags |= GCWQ_MANAGING_WORKERS;
+
+ list_for_each_entry(worker, &gcwq->idle_list, entry)
+ worker->flags |= WORKER_ROGUE;
+
+ for_each_busy_worker(worker, i, pos, gcwq)
+ worker->flags |= WORKER_ROGUE;
+
+ /*
+ * Call schedule() so that we cross rq->lock and thus can
+ * guarantee sched callbacks see the rogue flag. This is
+ * necessary as scheduler callbacks may be invoked from other
+ * cpus.
+ */
+ spin_unlock_irq(&gcwq->lock);
+ schedule();
+ spin_lock_irq(&gcwq->lock);
+
+ /*
+ * Sched callbacks are disabled now. Zap nr_running. After
+ * this, nr_running stays zero and need_more_worker() and
+ * keep_working() are always true as long as the worklist is
+ * not empty.
+ */
+ atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+
+ spin_unlock_irq(&gcwq->lock);
+ del_timer_sync(&gcwq->idle_timer);
+ spin_lock_irq(&gcwq->lock);
+
+ /*
+ * We're now in charge. Notify and proceed to drain. We need
+ * to keep the gcwq running during the whole CPU down
+ * procedure as other cpu hotunplug callbacks may need to
+ * flush currently running tasks.
+ */
+ gcwq->trustee_state = TRUSTEE_IN_CHARGE;
+ wake_up_all(&gcwq->trustee_wait);
+
+ /*
+ * The original cpu is in the process of dying and may go away
+ * anytime now. When that happens, we and all workers would
+ * be migrated to other cpus. Try draining any left work. We
+ * want to get it over with ASAP - spam rescuers, wake up as
+ * many idlers as necessary and create new ones till the
+ * worklist is empty. Note that if the gcwq is frozen, there
+ * may be frozen works in freezeable cwqs. Don't declare
+ * completion while frozen.
+ */
+ while (gcwq->nr_workers != gcwq->nr_idle ||
+ gcwq->flags & GCWQ_FREEZING ||
+ gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+ int nr_works = 0;
+
+ list_for_each_entry(work, &gcwq->worklist, entry) {
+ send_mayday(work);
+ nr_works++;
+ }
+
+ list_for_each_entry(worker, &gcwq->idle_list, entry) {
+ if (!nr_works--)
+ break;
+ wake_up_process(worker->task);
+ }
+
+ if (need_to_create_worker(gcwq)) {
+ spin_unlock_irq(&gcwq->lock);
+ worker = create_worker(gcwq, false);
+ spin_lock_irq(&gcwq->lock);
+ if (worker) {
+ worker->flags |= WORKER_ROGUE;
+ start_worker(worker);
+ }
+ }
+
+ /* give a breather */
+ if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
+ break;
+ }
+
+ /*
+ * Either all works have been scheduled and cpu is down, or
+ * cpu down has already been canceled. Wait for and butcher
+ * all workers till we're canceled.
+ */
+ do {
+ rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
+ while (!list_empty(&gcwq->idle_list))
+ destroy_worker(list_first_entry(&gcwq->idle_list,
+ struct worker, entry));
+ } while (gcwq->nr_workers && rc >= 0);
+
+ /*
+ * At this point, either draining has completed and no worker
+ * is left, or cpu down has been canceled or the cpu is being
+ * brought back up. There shouldn't be any idle one left.
+ * Tell the remaining busy ones to rebind once it finishes the
+ * currently scheduled works by scheduling the rebind_work.
+ */
+ WARN_ON(!list_empty(&gcwq->idle_list));
+
+ for_each_busy_worker(worker, i, pos, gcwq) {
+ struct work_struct *rebind_work = &worker->rebind_work;
+
+ /*
+ * Rebind_work may race with future cpu hotplug
+ * operations. Use a separate flag to mark that
+ * rebinding is scheduled.
+ */
+ worker->flags |= WORKER_REBIND;
+ worker->flags &= ~WORKER_ROGUE;
+
+ /* queue rebind_work, wq doesn't matter, use the default one */
+ if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+ work_data_bits(rebind_work)))
+ continue;
+
+ debug_work_activate(rebind_work);
+ insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+ worker->scheduled.next,
+ work_color_to_flags(WORK_NO_COLOR));
+ }
+
+ /* relinquish manager role */
+ gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
+
+ /* notify completion */
+ gcwq->trustee = NULL;
+ gcwq->trustee_state = TRUSTEE_DONE;
+ wake_up_all(&gcwq->trustee_wait);
+ spin_unlock_irq(&gcwq->lock);
+ return 0;
+}
+
+/**
+ * wait_trustee_state - wait for trustee to enter the specified state
+ * @gcwq: gcwq the trustee of interest belongs to
+ * @state: target state to wait for
+ *
+ * Wait for the trustee to reach @state. DONE is already matched.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times. To be used by cpu_callback.
+ */
+static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+{
+ if (!(gcwq->trustee_state == state ||
+ gcwq->trustee_state == TRUSTEE_DONE)) {
+ spin_unlock_irq(&gcwq->lock);
+ __wait_event(gcwq->trustee_wait,
+ gcwq->trustee_state == state ||
+ gcwq->trustee_state == TRUSTEE_DONE);
+ spin_lock_irq(&gcwq->lock);
+ }
+}
+
static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct cpu_workqueue_struct *cwq;
- struct workqueue_struct *wq;
- int err = 0;
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct task_struct *new_trustee = NULL;
+ struct worker *uninitialized_var(new_worker);
+ unsigned long flags;
action &= ~CPU_TASKS_FROZEN;
switch (action) {
+ case CPU_DOWN_PREPARE:
+ new_trustee = kthread_create(trustee_thread, gcwq,
+ "workqueue_trustee/%d\n", cpu);
+ if (IS_ERR(new_trustee))
+ return notifier_from_errno(PTR_ERR(new_trustee));
+ kthread_bind(new_trustee, cpu);
+ /* fall through */
case CPU_UP_PREPARE:
- cpumask_set_cpu(cpu, cpu_populated_map);
- }
-undo:
- list_for_each_entry(wq, &workqueues, list) {
- cwq = per_cpu_ptr(wq->cpu_wq, cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = create_workqueue_thread(cwq, cpu);
- if (!err)
- break;
- printk(KERN_ERR "workqueue [%s] for %i failed\n",
- wq->name, cpu);
- action = CPU_UP_CANCELED;
- err = -ENOMEM;
- goto undo;
-
- case CPU_ONLINE:
- start_workqueue_thread(cwq, cpu);
- break;
-
- case CPU_UP_CANCELED:
- start_workqueue_thread(cwq, -1);
- case CPU_POST_DEAD:
- cleanup_workqueue_thread(cwq);
- break;
+ BUG_ON(gcwq->first_idle);
+ new_worker = create_worker(gcwq, false);
+ if (!new_worker) {
+ if (new_trustee)
+ kthread_stop(new_trustee);
+ return NOTIFY_BAD;
}
}
+ /* some are called w/ irq disabled, don't disturb irq status */
+ spin_lock_irqsave(&gcwq->lock, flags);
+
switch (action) {
- case CPU_UP_CANCELED:
+ case CPU_DOWN_PREPARE:
+ /* initialize trustee and tell it to acquire the gcwq */
+ BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
+ gcwq->trustee = new_trustee;
+ gcwq->trustee_state = TRUSTEE_START;
+ wake_up_process(gcwq->trustee);
+ wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
+ /* fall through */
+ case CPU_UP_PREPARE:
+ BUG_ON(gcwq->first_idle);
+ gcwq->first_idle = new_worker;
+ break;
+
+ case CPU_DYING:
+ /*
+ * Before this, the trustee and all workers except for
+ * the ones which are still executing works from
+ * before the last CPU down must be on the cpu. After
+ * this, they'll all be diasporas.
+ */
+ gcwq->flags |= GCWQ_DISASSOCIATED;
+ break;
+
case CPU_POST_DEAD:
- cpumask_clear_cpu(cpu, cpu_populated_map);
+ gcwq->trustee_state = TRUSTEE_BUTCHER;
+ /* fall through */
+ case CPU_UP_CANCELED:
+ destroy_worker(gcwq->first_idle);
+ gcwq->first_idle = NULL;
+ break;
+
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ gcwq->flags &= ~GCWQ_DISASSOCIATED;
+ if (gcwq->trustee_state != TRUSTEE_DONE) {
+ gcwq->trustee_state = TRUSTEE_RELEASE;
+ wake_up_process(gcwq->trustee);
+ wait_trustee_state(gcwq, TRUSTEE_DONE);
+ }
+
+ /*
+ * Trustee is done and there might be no worker left.
+ * Put the first_idle in and request a real manager to
+ * take a look.
+ */
+ spin_unlock_irq(&gcwq->lock);
+ kthread_bind(gcwq->first_idle->task, cpu);
+ spin_lock_irq(&gcwq->lock);
+ gcwq->flags |= GCWQ_MANAGE_WORKERS;
+ start_worker(gcwq->first_idle);
+ gcwq->first_idle = NULL;
+ break;
}
- return notifier_from_errno(err);
+ spin_unlock_irqrestore(&gcwq->lock, flags);
+
+ return notifier_from_errno(0);
}
#ifdef CONFIG_SMP
@@ -1201,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
EXPORT_SYMBOL_GPL(work_on_cpu);
#endif /* CONFIG_SMP */
-void __init init_workqueues(void)
+#ifdef CONFIG_FREEZER
+
+/**
+ * freeze_workqueues_begin - begin freezing workqueues
+ *
+ * Start freezing workqueues. After this function returns, all
+ * freezeable workqueues will queue new works to their frozen_works
+ * list instead of gcwq->worklist.
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock and gcwq->lock's.
+ */
+void freeze_workqueues_begin(void)
{
- alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
+ unsigned int cpu;
- cpumask_copy(cpu_populated_map, cpu_online_mask);
- singlethread_cpu = cpumask_first(cpu_possible_mask);
- cpu_singlethread_map = cpumask_of(singlethread_cpu);
- hotcpu_notifier(workqueue_cpu_callback, 0);
- keventd_wq = create_workqueue("events");
- BUG_ON(!keventd_wq);
+ spin_lock(&workqueue_lock);
+
+ BUG_ON(workqueue_freezing);
+ workqueue_freezing = true;
+
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct workqueue_struct *wq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ BUG_ON(gcwq->flags & GCWQ_FREEZING);
+ gcwq->flags |= GCWQ_FREEZING;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (cwq && wq->flags & WQ_FREEZEABLE)
+ cwq->max_active = 0;
+ }
+
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ spin_unlock(&workqueue_lock);
+}
+
+/**
+ * freeze_workqueues_busy - are freezeable workqueues still busy?
+ *
+ * Check whether freezing is complete. This function must be called
+ * between freeze_workqueues_begin() and thaw_workqueues().
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock.
+ *
+ * RETURNS:
+ * %true if some freezeable workqueues are still busy. %false if
+ * freezing is complete.
+ */
+bool freeze_workqueues_busy(void)
+{
+ unsigned int cpu;
+ bool busy = false;
+
+ spin_lock(&workqueue_lock);
+
+ BUG_ON(!workqueue_freezing);
+
+ for_each_gcwq_cpu(cpu) {
+ struct workqueue_struct *wq;
+ /*
+ * nr_active is monotonically decreasing. It's safe
+ * to peek without lock.
+ */
+ list_for_each_entry(wq, &workqueues, list) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ continue;
+
+ BUG_ON(cwq->nr_active < 0);
+ if (cwq->nr_active) {
+ busy = true;
+ goto out_unlock;
+ }
+ }
+ }
+out_unlock:
+ spin_unlock(&workqueue_lock);
+ return busy;
+}
+
+/**
+ * thaw_workqueues - thaw workqueues
+ *
+ * Thaw workqueues. Normal queueing is restored and all collected
+ * frozen works are transferred to their respective gcwq worklists.
+ *
+ * CONTEXT:
+ * Grabs and releases workqueue_lock and gcwq->lock's.
+ */
+void thaw_workqueues(void)
+{
+ unsigned int cpu;
+
+ spin_lock(&workqueue_lock);
+
+ if (!workqueue_freezing)
+ goto out_unlock;
+
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct workqueue_struct *wq;
+
+ spin_lock_irq(&gcwq->lock);
+
+ BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
+ gcwq->flags &= ~GCWQ_FREEZING;
+
+ list_for_each_entry(wq, &workqueues, list) {
+ struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+
+ if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ continue;
+
+ /* restore max_active and repopulate worklist */
+ cwq->max_active = wq->saved_max_active;
+
+ while (!list_empty(&cwq->delayed_works) &&
+ cwq->nr_active < cwq->max_active)
+ cwq_activate_first_delayed(cwq);
+ }
+
+ wake_up_worker(gcwq);
+
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ workqueue_freezing = false;
+out_unlock:
+ spin_unlock(&workqueue_lock);
+}
+#endif /* CONFIG_FREEZER */
+
+static int __init init_workqueues(void)
+{
+ unsigned int cpu;
+ int i;
+
+ cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+
+ /* initialize gcwqs */
+ for_each_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+
+ spin_lock_init(&gcwq->lock);
+ INIT_LIST_HEAD(&gcwq->worklist);
+ gcwq->cpu = cpu;
+ if (cpu == WORK_CPU_UNBOUND)
+ gcwq->flags |= GCWQ_DISASSOCIATED;
+
+ INIT_LIST_HEAD(&gcwq->idle_list);
+ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
+
+ init_timer_deferrable(&gcwq->idle_timer);
+ gcwq->idle_timer.function = idle_worker_timeout;
+ gcwq->idle_timer.data = (unsigned long)gcwq;
+
+ setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+ (unsigned long)gcwq);
+
+ ida_init(&gcwq->worker_ida);
+
+ gcwq->trustee_state = TRUSTEE_DONE;
+ init_waitqueue_head(&gcwq->trustee_wait);
+ }
+
+ /* create the initial worker */
+ for_each_online_gcwq_cpu(cpu) {
+ struct global_cwq *gcwq = get_gcwq(cpu);
+ struct worker *worker;
+
+ worker = create_worker(gcwq, true);
+ BUG_ON(!worker);
+ spin_lock_irq(&gcwq->lock);
+ start_worker(worker);
+ spin_unlock_irq(&gcwq->lock);
+ }
+
+ system_wq = alloc_workqueue("events", 0, 0);
+ system_long_wq = alloc_workqueue("events_long", 0, 0);
+ system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq);
+ return 0;
}
+early_initcall(init_workqueues);
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..2d10fc98dc79
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,9 @@
+/*
+ * kernel/workqueue_sched.h
+ *
+ * Scheduler hooks for concurrency managed workqueue. Only to be
+ * included from sched.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+ unsigned int cpu);