Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 47
-rw-r--r--  kernel/acct.c | 15
-rw-r--r--  kernel/async.c | 179
-rw-r--r--  kernel/audit.c | 22
-rw-r--r--  kernel/audit.h | 3
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/capability.c | 24
-rw-r--r--  kernel/cgroup.c | 1114
-rw-r--r--  kernel/compat.c | 158
-rw-r--r--  kernel/configs.c | 2
-rw-r--r--  kernel/context_tracking.c | 114
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpu/idle.c | 116
-rw-r--r--  kernel/cpuset.c | 992
-rw-r--r--  kernel/debug/debug_core.c | 3
-rw-r--r--  kernel/debug/debug_core.h | 2
-rw-r--r--  kernel/debug/gdbstub.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 20
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 25
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 135
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/events/core.c | 110
-rw-r--r--  kernel/events/hw_breakpoint.c | 2
-rw-r--r--  kernel/events/internal.h | 2
-rw-r--r--  kernel/events/ring_buffer.c | 36
-rw-r--r--  kernel/events/uprobes.c | 746
-rw-r--r--  kernel/exit.c | 21
-rw-r--r--  kernel/extable.c | 6
-rw-r--r--  kernel/fork.c | 32
-rw-r--r--  kernel/futex.c | 51
-rw-r--r--  kernel/futex_compat.c | 21
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 71
-rw-r--r--  kernel/irq/chip.c | 30
-rw-r--r--  kernel/irq/irqdomain.c | 20
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/irq/proc.c | 20
-rw-r--r--  kernel/irq/spurious.c | 7
-rw-r--r--  kernel/irq_work.c | 150
-rw-r--r--  kernel/kallsyms.c | 26
-rw-r--r--  kernel/kexec.c | 227
-rw-r--r--  kernel/kfifo.c | 609
-rw-r--r--  kernel/kmod.c | 107
-rw-r--r--  kernel/kprobes.c | 85
-rw-r--r--  kernel/kthread.c | 111
-rw-r--r--  kernel/lockdep.c | 43
-rw-r--r--  kernel/modsign_certificate.S | 13
-rw-r--r--  kernel/module.c | 160
-rw-r--r--  kernel/mutex.c | 152
-rw-r--r--  kernel/nsproxy.c | 9
-rw-r--r--  kernel/panic.c | 40
-rw-r--r--  kernel/params.c | 5
-rw-r--r--  kernel/pid.c | 15
-rw-r--r--  kernel/pid_namespace.c | 7
-rw-r--r--  kernel/posix-cpu-timers.c | 127
-rw-r--r--  kernel/posix-timers.c | 132
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/console.c | 116
-rw-r--r--  kernel/power/main.c | 29
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 4
-rw-r--r--  kernel/power/qos.c | 9
-rw-r--r--  kernel/power/suspend.c | 81
-rw-r--r--  kernel/power/suspend_test.c | 11
-rw-r--r--  kernel/printk.c | 187
-rw-r--r--  kernel/profile.c | 30
-rw-r--r--  kernel/ptrace.c | 87
-rw-r--r--  kernel/range.c | 3
-rw-r--r--  kernel/rcu.h | 7
-rw-r--r--  kernel/rcupdate.c | 60
-rw-r--r--  kernel/rcutiny.c | 8
-rw-r--r--  kernel/rcutiny_plugin.h | 56
-rw-r--r--  kernel/rcutorture.c | 66
-rw-r--r--  kernel/rcutree.c | 526
-rw-r--r--  kernel/rcutree.h | 54
-rw-r--r--  kernel/rcutree_plugin.h | 622
-rw-r--r--  kernel/rcutree_trace.c | 10
-rw-r--r--  kernel/relay.c | 20
-rw-r--r--  kernel/resource.c | 198
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 6
-rw-r--r--  kernel/rtmutex.c | 1
-rw-r--r--  kernel/rwsem.c | 16
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/clock.c | 26
-rw-r--r--  kernel/sched/core.c | 627
-rw-r--r--  kernel/sched/cpuacct.c | 296
-rw-r--r--  kernel/sched/cpuacct.h | 17
-rw-r--r--  kernel/sched/cpupri.c | 2
-rw-r--r--  kernel/sched/cputime.c | 490
-rw-r--r--  kernel/sched/debug.c | 97
-rw-r--r--  kernel/sched/fair.c | 185
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle_task.c | 17
-rw-r--r--  kernel/sched/rt.c | 26
-rw-r--r--  kernel/sched/sched.h | 246
-rw-r--r--  kernel/sched/stats.c | 74
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/semaphore.c | 8
-rw-r--r--  kernel/signal.c | 372
-rw-r--r--  kernel/smp.c | 262
-rw-r--r--  kernel/smpboot.c | 17
-rw-r--r--  kernel/softirq.c | 67
-rw-r--r--  kernel/srcu.c | 37
-rw-r--r--  kernel/stop_machine.c | 156
-rw-r--r--  kernel/sys.c | 596
-rw-r--r--  kernel/sys_ni.c | 4
-rw-r--r--  kernel/sysctl.c | 42
-rw-r--r--  kernel/sysctl_binary.c | 47
-rw-r--r--  kernel/test_kprobes.c | 2
-rw-r--r--  kernel/time.c | 21
-rw-r--r--  kernel/time/Kconfig | 89
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/ntp.c | 127
-rw-r--r--  kernel/time/ntp_internal.h | 12
-rw-r--r--  kernel/time/tick-broadcast.c | 283
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 5
-rw-r--r--  kernel/time/tick-sched.c | 314
-rw-r--r--  kernel/time/timekeeping.c | 467
-rw-r--r--  kernel/time/timer_list.c | 104
-rw-r--r--  kernel/timeconst.bc | 108
-rw-r--r--  kernel/timeconst.pl | 378
-rw-r--r--  kernel/timer.c | 161
-rw-r--r--  kernel/trace/Kconfig | 104
-rw-r--r--  kernel/trace/blktrace.c | 9
-rw-r--r--  kernel/trace/ftrace.c | 312
-rw-r--r--  kernel/trace/power-traces.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 614
-rw-r--r--  kernel/trace/trace.c | 2380
-rw-r--r--  kernel/trace/trace.h | 281
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 15
-rw-r--r--  kernel/trace/trace_entries.h | 23
-rw-r--r--  kernel/trace/trace_events.c | 1398
-rw-r--r--  kernel/trace/trace_events_filter.c | 34
-rw-r--r--  kernel/trace/trace_export.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 266
-rw-r--r--  kernel/trace/trace_functions_graph.c | 80
-rw-r--r--  kernel/trace/trace_irqsoff.c | 90
-rw-r--r--  kernel/trace/trace_kdb.c | 12
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 12
-rw-r--r--  kernel/trace/trace_output.c | 122
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_switch.c | 8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 95
-rw-r--r--  kernel/trace/trace_selftest.c | 72
-rw-r--r--  kernel/trace/trace_stack.c | 78
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 139
-rw-r--r--  kernel/trace/trace_uprobe.c | 398
-rw-r--r--  kernel/tracepoint.c | 27
-rw-r--r--  kernel/tsacct.c | 44
-rw-r--r--  kernel/uid16.c | 55
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 101
-rw-r--r--  kernel/utsname.c | 4
-rw-r--r--  kernel/utsname_sysctl.c | 3
-rw-r--r--  kernel/watchdog.c | 16
-rw-r--r--  kernel/workqueue.c | 3895
-rw-r--r--  kernel/workqueue_internal.h | 72
-rw-r--r--  kernel/workqueue_sched.h | 9
169 files changed, 16075 insertions(+), 8943 deletions(-)
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c072b6da239..271fd3119af9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
rcupdate.o extable.o params.o posix-timers.o \
- kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o \
async.o range.o groups.o lglock.o smpboot.o
@@ -24,10 +24,9 @@ endif
obj-y += sched/
obj-y += power/
+obj-y += cpu/
-ifeq ($(CONFIG_CHECKPOINT_RESTORE),y)
-obj-$(CONFIG_X86) += kcmp.o
-endif
+obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -127,11 +126,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(obj)/time.o: $(obj)/timeconst.h
-quiet_cmd_timeconst = TIMEC $@
- cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+quiet_cmd_hzfile = HZFILE $@
+ cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
+
+targets += hz.bc
+$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
+ $(call if_changed,hzfile)
+
+quiet_cmd_bc = BC $@
+ cmd_bc = bc -q $(filter-out FORCE,$^) > $@
+
targets += timeconst.h
-$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
- $(call if_changed,timeconst)
+$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
+ $(call if_changed,bc)
ifeq ($(CONFIG_MODULE_SIG),y)
#
@@ -153,23 +160,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates
# fail and that the kernel may be used afterwards.
#
###############################################################################
-sign_key_with_hash :=
-ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
-sign_key_with_hash := -sha1
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
-sign_key_with_hash := -sha224
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
-sign_key_with_hash := -sha256
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
-sign_key_with_hash := -sha384
-endif
-ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
-sign_key_with_hash := -sha512
-endif
-ifeq ($(sign_key_with_hash),)
+ifndef CONFIG_MODULE_SIG_HASH
$(error Could not determine digest type to use from kernel config)
endif
@@ -182,10 +173,10 @@ signing_key.priv signing_key.x509: x509.genkey
@echo "### needs to be run as root, and uses a hardware random"
@echo "### number generator if one is available."
@echo "###"
- openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
- -x509 -config x509.genkey \
+ openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
+ -batch -x509 -config x509.genkey \
-outform DER -out signing_key.x509 \
- -keyout signing_key.priv
+ -keyout signing_key.priv 2>&1
@echo "###"
@echo "### Key pair generated."
@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..8d6e145138bb 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
if (IS_ERR(file))
return PTR_ERR(file);
- if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+ if (!S_ISREG(file_inode(file)->i_mode)) {
filp_close(file, NULL);
return -EACCES;
}
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+ file_end_write(file);
out:
revert_creds(orig_cred);
}
@@ -566,6 +573,7 @@ out:
void acct_collect(long exitcode, int group_dead)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ cputime_t utime, stime;
unsigned long vsize = 0;
if (group_dead && current->mm) {
@@ -593,8 +601,9 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime += current->utime;
- pacct->ac_stime += current->stime;
+ task_cputime(current, &utime, &stime);
+ pacct->ac_utime += utime;
+ pacct->ac_stime += stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
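/*
 * Minimal sketch (not from the patch) of the freeze-protection pattern the
 * acct.c hunks above adopt: take the superblock write reference with
 * file_start_write_trylock() and skip the write when the filesystem is
 * frozen instead of blocking, which could deadlock the freezer.
 * my_write_record() is hypothetical; the trylock/end pair is the real API.
 */
#include <linux/fs.h>

static void my_write_record(struct file *file, const void *rec, size_t size)
{
	if (!file_start_write_trylock(file))
		return;	/* fs frozen: dropping one record beats a deadlock */

	/* ... write @rec to @file as the unpatched code did ... */

	file_end_write(file);
}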
diff --git a/kernel/async.c b/kernel/async.c
index 6f34904a0b53..61f023ce0228 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,65 +57,48 @@ asynchronous and synchronous parts of the kernel.
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include "workqueue_internal.h"
+
static async_cookie_t next_cookie = 1;
-#define MAX_WORK 32768
+#define MAX_WORK 32768
+#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
-static LIST_HEAD(async_pending);
-static ASYNC_DOMAIN(async_running);
-static LIST_HEAD(async_domains);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
+static ASYNC_DOMAIN(async_dfl_domain);
static DEFINE_SPINLOCK(async_lock);
-static DEFINE_MUTEX(async_register_mutex);
struct async_entry {
- struct list_head list;
+ struct list_head domain_list;
+ struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
- async_func_ptr *func;
+ async_func_t func;
void *data;
- struct async_domain *running;
+ struct async_domain *domain;
};
static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t __lowest_in_progress(struct async_domain *running)
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- async_cookie_t first_running = next_cookie; /* infinity value */
- async_cookie_t first_pending = next_cookie; /* ditto */
- struct async_entry *entry;
-
- /*
- * Both running and pending lists are sorted but not disjoint.
- * Take the first cookies from both and return the min.
- */
- if (!list_empty(&running->domain)) {
- entry = list_first_entry(&running->domain, typeof(*entry), list);
- first_running = entry->cookie;
- }
+ struct list_head *pending;
+ async_cookie_t ret = ASYNC_COOKIE_MAX;
+ unsigned long flags;
- list_for_each_entry(entry, &async_pending, list) {
- if (entry->running == running) {
- first_pending = entry->cookie;
- break;
- }
- }
+ spin_lock_irqsave(&async_lock, flags);
- return min(first_running, first_pending);
-}
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
-static async_cookie_t lowest_in_progress(struct async_domain *running)
-{
- unsigned long flags;
- async_cookie_t ret;
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
- spin_lock_irqsave(&async_lock, flags);
- ret = __lowest_in_progress(running);
spin_unlock_irqrestore(&async_lock, flags);
return ret;
}
@@ -127,20 +110,10 @@ static void async_run_entry_fn(struct work_struct *work)
{
struct async_entry *entry =
container_of(work, struct async_entry, work);
- struct async_entry *pos;
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
- struct async_domain *running = entry->running;
- /* 1) move self to the running queue, make sure it stays sorted */
- spin_lock_irqsave(&async_lock, flags);
- list_for_each_entry_reverse(pos, &running->domain, list)
- if (entry->cookie < pos->cookie)
- break;
- list_move_tail(&entry->list, &pos->list);
- spin_unlock_irqrestore(&async_lock, flags);
-
- /* 2) run (and print duration) */
+ /* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
(long long)entry->cookie,
@@ -157,23 +130,22 @@ static void async_run_entry_fn(struct work_struct *work)
(long long)ktime_to_ns(delta) >> 10);
}
- /* 3) remove self from the running queue */
+ /* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
- list_del(&entry->list);
- if (running->registered && --running->count == 0)
- list_del_init(&running->node);
+ list_del_init(&entry->domain_list);
+ list_del_init(&entry->global_list);
- /* 4) free the entry */
+ /* 3) free the entry */
kfree(entry);
atomic_dec(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* 5) wake up any waiters */
+ /* 4) wake up any waiters */
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -193,19 +165,25 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
- ptr(data, newcookie);
+ func(data, newcookie);
return newcookie;
}
+ INIT_LIST_HEAD(&entry->domain_list);
+ INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = ptr;
+ entry->func = func;
entry->data = data;
- entry->running = running;
+ entry->domain = domain;
spin_lock_irqsave(&async_lock, flags);
+
+ /* allocate cookie and queue */
newcookie = entry->cookie = next_cookie++;
- list_add_tail(&entry->list, &async_pending);
- if (running->registered && running->count++ == 0)
- list_add_tail(&running->node, &async_domains);
+
+ list_add_tail(&entry->domain_list, &domain->pending);
+ if (domain->registered)
+ list_add_tail(&entry->global_list, &async_global_pending);
+
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
@@ -220,34 +198,34 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
/**
* async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
{
- return __async_schedule(ptr, data, &async_running);
+ return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
*
* Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
- * to wait within a certain synchronization domain rather than globally.
- * A synchronization domain is specified via the running queue @running to use.
- * Note: This function may be called from atomic or non-atomic contexts.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally. A
+ * synchronization domain is specified via @domain. Note: This function
+ * may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
- struct async_domain *running)
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
+ struct async_domain *domain)
{
- return __async_schedule(ptr, data, running);
+ return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -258,18 +236,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- mutex_lock(&async_register_mutex);
- do {
- struct async_domain *domain = NULL;
-
- spin_lock_irq(&async_lock);
- if (!list_empty(&async_domains))
- domain = list_first_entry(&async_domains, typeof(*domain), node);
- spin_unlock_irq(&async_lock);
-
- async_synchronize_cookie_domain(next_cookie, domain);
- } while (!list_empty(&async_domains));
- mutex_unlock(&async_register_mutex);
+ async_synchronize_full_domain(NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
@@ -284,51 +251,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
*/
void async_unregister_domain(struct async_domain *domain)
{
- mutex_lock(&async_register_mutex);
spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->node) ||
- !list_empty(&domain->domain));
+ WARN_ON(!domain->registered || !list_empty(&domain->pending));
domain->registered = 0;
spin_unlock_irq(&async_lock);
- mutex_unlock(&async_register_mutex);
}
EXPORT_SYMBOL_GPL(async_unregister_domain);
/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @domain: running list to synchronize on
+ * @domain: the domain to synchronize
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @domain have been done.
+ * synchronization domain specified by @domain have been done.
*/
void async_synchronize_full_domain(struct async_domain *domain)
{
- async_synchronize_cookie_domain(next_cookie, domain);
+ async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
*
* This function waits until all asynchronous function calls for the
- * synchronization domain specified by running list @running submitted
- * prior to @cookie have been done.
+ * synchronization domain specified by @domain submitted prior to @cookie
+ * have been done.
*/
-void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (!running)
- return;
-
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
- wait_event(async_done, lowest_in_progress(running) >= cookie);
+ wait_event(async_done, lowest_in_progress(domain) >= cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
endtime = ktime_get();
@@ -350,6 +311,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
*/
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_domain(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+ struct worker *worker = current_wq_worker();
+
+ return worker && worker->current_func == async_run_entry_fn;
+}
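/*
 * Hedged usage sketch (not part of the diff) of the reworked async API;
 * my_probe() and my_init() are hypothetical, while async_schedule(),
 * async_synchronize_cookie(), async_synchronize_full() and the new
 * current_is_async() are the interfaces touched above.
 */
#include <linux/async.h>
#include <linux/init.h>
#include <linux/kernel.h>

static void my_probe(void *data, async_cookie_t cookie)
{
	/* runs in an async worker, so current_is_async() is true here */
	WARN_ON(!current_is_async());
	/* ... slow initialization using @data ... */
}

static int __init my_init(void)
{
	async_cookie_t cookie = async_schedule(my_probe, NULL);

	/* wait for all async work scheduled *before* my_probe ... */
	async_synchronize_cookie(cookie);
	/* ... or for everything outstanding across all registered domains */
	async_synchronize_full();
	return 0;
}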
diff --git a/kernel/audit.c b/kernel/audit.c
index 5c7e62ff4795..21c7fa615bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -60,7 +60,7 @@
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
#include <linux/pid_namespace.h>
@@ -646,14 +646,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
@@ -859,7 +859,7 @@ static void audit_receive_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
- * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
@@ -868,13 +868,13 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- while (NLMSG_OK(nlh, len)) {
+ while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- nlh = NLMSG_NEXT(nlh, len);
+ nlh = nlmsg_next(nlh, &len);
}
}
@@ -1667,7 +1667,7 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
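/*
 * Small sketch mirroring the converted receive loop above: walking the
 * netlink messages in an skb with the nlmsg_* helpers that replace the
 * old NLMSG_* macros. walk_msgs() and the elided per-message handler are
 * illustrative.
 */
#include <net/netlink.h>

static void walk_msgs(struct sk_buff *skb)
{
	struct nlmsghdr *nlh = nlmsg_hdr(skb);
	/* must stay signed: nlmsg_next() may drive it below 0 */
	int len = skb->len;

	while (nlmsg_ok(nlh, len)) {
		/* ... handle one message at @nlh ... */
		nlh = nlmsg_next(nlh, &len);
	}
}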
diff --git a/kernel/audit.h b/kernel/audit.h
index 45c8325de5bb..1c95131ef760 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -205,8 +205,6 @@ struct audit_context {
#endif
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
extern int audit_ever_enabled;
extern void audit_copy_inode(struct audit_names *name,
@@ -218,7 +216,6 @@ extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
extern void audit_log_name(struct audit_context *context,
struct audit_names *n, struct path *path,
int record_num, int *call_panic);
-#endif
extern int audit_pid;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bc6595fe952e..83a2970295d1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -533,6 +533,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
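/*
 * Hedged sketch of a caller of the new file_ns_capable(): check the
 * credentials of whoever opened the file rather than current's, so an fd
 * passed to a more-privileged task cannot silently gain rights.
 * my_priv_write() is hypothetical.
 */
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/user_namespace.h>

static ssize_t my_priv_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	if (!file_ns_capable(file, &init_user_ns, CAP_NET_ADMIN))
		return -EPERM;
	/* ... act on the privileged request ... */
	return count;
}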
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..2a9926275f80 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
-#include <linux/fs.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
@@ -52,14 +51,14 @@
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
-#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
#include <linux/atomic.h>
@@ -83,7 +82,13 @@
* B happens only through cgroup_show_options() and using cgroup_root_mutex
* breaks it.
*/
+#ifdef CONFIG_PROVE_RCU
+DEFINE_MUTEX(cgroup_mutex);
+EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
+#else
static DEFINE_MUTEX(cgroup_mutex);
+#endif
+
static DEFINE_MUTEX(cgroup_root_mutex);
/*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};
-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/*
- * A cgroupfs_root represents the root of a cgroup hierarchy,
- * and may be associated with a superblock to form an active
- * hierarchy
- */
-struct cgroupfs_root {
- struct super_block *sb;
-
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
- unsigned long subsys_mask;
-
- /* Unique id for this hierarchy. */
- int hierarchy_id;
-
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_mask;
-
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
-
- /* The root cgroup for this hierarchy */
- struct cgroup top_cgroup;
-
- /* Tracks how many cgroups are currently defined in hierarchy.*/
- int number_of_cgroups;
-
- /* A list running through the active hierarchies */
- struct list_head root_list;
-
- /* All cgroups on this root, cgroup_mutex protected */
- struct list_head allcg_list;
-
- /* Hierarchy-specific flags */
- unsigned long flags;
-
- /* IDs for cgroups in this hierarchy */
- struct ida cgroup_ida;
-
- /* The path to use for release notifications. */
- char release_agent_path[PATH_MAX];
-
- /* The name for this hierarchy - may be empty */
- char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
struct list_head node;
struct dentry *dentry;
struct cftype *type;
+
+ /* file xattrs */
+ struct simple_xattrs xattrs;
};
/*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
+static struct cgroup_name root_cgroup_name = { .name = "/" };
+
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
struct cftype cfts[], bool is_add);
-#ifdef CONFIG_PROVE_LOCKING
-int cgroup_lock_is_held(void)
-{
- return lockdep_is_held(&cgroup_mutex);
-}
-#else /* #ifdef CONFIG_PROVE_LOCKING */
-int cgroup_lock_is_held(void)
-{
- return mutex_is_locked(&cgroup_mutex);
-}
-#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
-
-EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
-
static int css_unbias_refcnt(int refcnt)
{
return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
return test_bit(CGRP_REMOVED, &cgrp->flags);
}
-/* bits in struct cgroupfs_root flags field */
-enum {
- ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
- ROOT_XATTR, /* supports extended attributes */
-};
+/**
+ * cgroup_is_descendant - test ancestry
+ * @cgrp: the cgroup to be tested
+ * @ancestor: possible ancestor of @cgrp
+ *
+ * Test whether @cgrp is a descendant of @ancestor. It also returns %true
+ * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
+ * and @ancestor are accessible.
+ */
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
+{
+ while (cgrp) {
+ if (cgrp == ancestor)
+ return true;
+ cgrp = cgrp->parent;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(cgroup_is_descendant);
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
return __d_cfe(dentry)->type;
}
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * On success, returns true; the mutex should be later unlocked. On
+ * failure returns false with no lock held.
+ */
+static bool cgroup_lock_live_group(struct cgroup *cgrp)
+{
+ mutex_lock(&cgroup_mutex);
+ if (cgroup_is_removed(cgrp)) {
+ mutex_unlock(&cgroup_mutex);
+ return false;
+ }
+ return true;
+}
+
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
@@ -376,22 +353,18 @@ static int css_set_count;
* account cgroups in empty hierarchies.
*/
#define CSS_SET_HASH_BITS 7
-#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
int i;
- int index;
- unsigned long tmp = 0UL;
+ unsigned long key = 0UL;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
- tmp += (unsigned long)css[i];
- tmp = (tmp >> 16) ^ tmp;
+ key += (unsigned long)css[i];
+ key = (key >> 16) ^ key;
- index = hash_long(tmp, CSS_SET_HASH_BITS);
-
- return &css_set_table[index];
+ return key;
}
/* We don't maintain the lists running through each css_set to its
@@ -418,7 +391,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
/* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +399,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
+
+ /*
+ * We may not be holding cgroup_mutex, and if cgrp->count is
+ * dropped to 0 the cgroup can be destroyed at any time, hence
+ * rcu_read_lock is used to keep it alive.
+ */
+ rcu_read_lock();
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+ rcu_read_unlock();
kfree(link);
}
@@ -550,9 +531,8 @@ static struct css_set *find_existing_css_set(
{
int i;
struct cgroupfs_root *root = cgrp->root;
- struct hlist_head *hhead;
- struct hlist_node *node;
struct css_set *cg;
+ unsigned long key;
/*
* Build the set of subsystem state objects that we want to see in the
@@ -572,8 +552,8 @@ static struct css_set *find_existing_css_set(
}
}
- hhead = css_set_hash(template);
- hlist_for_each_entry(cg, node, hhead, hlist) {
+ key = css_set_hash(template);
+ hash_for_each_possible(css_set_table, cg, hlist, key) {
if (!compare_css_sets(cg, oldcg, cgrp, template))
continue;
@@ -657,8 +637,8 @@ static struct css_set *find_css_set(
struct list_head tmp_cg_links;
- struct hlist_head *hhead;
struct cg_cgroup_link *link;
+ unsigned long key;
/* First see if we already have a cgroup group that matches
* the desired set */
@@ -704,8 +684,8 @@ static struct css_set *find_css_set(
css_set_count++;
/* Add this cgroup group to the hash table */
- hhead = css_set_hash(res->subsys);
- hlist_add_head(&res->hlist, hhead);
+ key = css_set_hash(res->subsys);
+ hash_add(css_set_table, &res->hlist, key);
write_unlock(&css_set_lock);
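/*
 * Minimal sketch of the <linux/hashtable.h> API that the css_set lookup
 * is converted to above; struct item, item_table and the helpers are
 * illustrative, while hash_add()/hash_del()/hash_for_each_possible() are
 * the real interface.
 */
#include <linux/hashtable.h>

struct item {
	unsigned long key;
	struct hlist_node hlist;
};

static DEFINE_HASHTABLE(item_table, 7);	/* 2^7 buckets, like CSS_SET_HASH_BITS */

static struct item *item_lookup(unsigned long key)
{
	struct item *it;

	hash_for_each_possible(item_table, it, hlist, key)
		if (it->key == key)
			return it;
	return NULL;
}

static void item_insert(struct item *it)
{
	hash_add(item_table, &it->hlist, it->key);
}

static void item_remove(struct item *it)
{
	hash_del(&it->hlist);
}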
@@ -797,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-/**
- * cgroup_lock - lock out any changes to cgroup structures
- *
- */
-void cgroup_lock(void)
-{
- mutex_lock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_lock);
-
-/**
- * cgroup_unlock - release lock on cgroup changes
- *
- * Undo the lock taken in a previous cgroup_lock() call.
- */
-void cgroup_unlock(void)
-{
- mutex_unlock(&cgroup_mutex);
-}
-EXPORT_SYMBOL_GPL(cgroup_unlock);
-
/*
* A couple of forward declarations required, due to cyclic reference loop:
* cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -856,57 +815,84 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
{
- /* is dentry a directory ? if so, kfree() associated cgroup */
- if (S_ISDIR(inode->i_mode)) {
- struct cgroup *cgrp = dentry->d_fsdata;
- struct cgroup_subsys *ss;
- BUG_ON(!(cgroup_is_removed(cgrp)));
- /* It's possible for external users to be holding css
- * reference counts on a cgroup; css_put() needs to
- * be able to access the cgroup after decrementing
- * the reference count in order to know if it needs to
- * queue the cgroup to be handled by the release
- * agent */
- synchronize_rcu();
+ struct cgroup_name *name;
- mutex_lock(&cgroup_mutex);
- /*
- * Release the subsystem state objects.
- */
- for_each_subsys(cgrp->root, ss)
- ss->css_free(cgrp);
+ name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ strcpy(name->name, dentry->d_name.name);
+ return name;
+}
- cgrp->root->number_of_cgroups--;
- mutex_unlock(&cgroup_mutex);
+static void cgroup_free_fn(struct work_struct *work)
+{
+ struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+ struct cgroup_subsys *ss;
- /*
- * Drop the active superblock reference that we took when we
- * created the cgroup
- */
- deactivate_super(cgrp->root->sb);
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss)
+ ss->css_free(cgrp);
- /*
- * if we're getting rid of the cgroup, refcount should ensure
- * that there are no pidlists left.
- */
- BUG_ON(!list_empty(&cgrp->pidlists));
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
+
+ /*
+ * We get a ref to the parent's dentry, and put the ref when
+ * this cgroup is being freed, so it's guaranteed that the
+ * parent won't be destroyed before its children.
+ */
+ dput(cgrp->parent->dentry);
+
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+
+ /*
+ * Drop the active superblock reference that we took when we
+ * created the cgroup. This will free cgrp->root, if we are
+ * holding the last reference to @sb.
+ */
+ deactivate_super(cgrp->root->sb);
+
+ /*
+ * if we're getting rid of the cgroup, refcount should ensure
+ * that there are no pidlists left.
+ */
+ BUG_ON(!list_empty(&cgrp->pidlists));
+
+ simple_xattrs_free(&cgrp->xattrs);
- simple_xattrs_free(&cgrp->xattrs);
+ kfree(rcu_dereference_raw(cgrp->name));
+ kfree(cgrp);
+}
+
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+ struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+
+ schedule_work(&cgrp->free_work);
+}
+
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+ /* is dentry a directory ? if so, kfree() associated cgroup */
+ if (S_ISDIR(inode->i_mode)) {
+ struct cgroup *cgrp = dentry->d_fsdata;
- ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
- kfree_rcu(cgrp, rcu_head);
+ BUG_ON(!(cgroup_is_removed(cgrp)));
+ call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
} else {
struct cfent *cfe = __d_cfe(dentry);
struct cgroup *cgrp = dentry->d_parent->d_fsdata;
- struct cftype *cft = cfe->type;
WARN_ONCE(!list_empty(&cfe->node) &&
cgrp != &cgrp->root->top_cgroup,
"cfe still linked for %s\n", cfe->type->name);
+ simple_xattrs_free(&cfe->xattrs);
kfree(cfe);
- simple_xattrs_free(&cft->xattrs);
}
iput(inode);
}
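/*
 * Condensed sketch (assumption) of the two-stage teardown introduced
 * above: the final dput() may run in a context that cannot sleep, so
 * freeing is bounced through RCU and then a workqueue, where heavy work
 * (cgroup_mutex, dput() of the parent, deactivate_super()) is safe.
 * struct obj and its helpers are illustrative.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
	struct rcu_head rcu_head;
	struct work_struct free_work;	/* INIT_WORK()ed at creation time */
};

static void obj_free_fn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, free_work);

	/* process context: sleeping locks, dput(), etc. are fine here */
	kfree(o);
}

static void obj_free_rcu(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu_head);

	schedule_work(&o->free_work);	/* softirq context: just punt */
}

/* release path, once the last reference is gone:
 *	call_rcu(&o->rcu_head, obj_free_rcu);
 */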
@@ -925,13 +911,17 @@ static void remove_dir(struct dentry *d)
dput(parent);
}
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
struct cfent *cfe;
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * If we're doing cleanup due to failure of cgroup_create(),
+ * the corresponding @cfe may not exist.
+ */
list_for_each_entry(cfe, &cgrp->files, node) {
struct dentry *d = cfe->dentry;
@@ -944,9 +934,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
list_del_init(&cfe->node);
dput(d);
- return 0;
+ break;
}
- return -ENOENT;
}
/**
@@ -1083,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
}
}
root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
- synchronize_rcu();
return 0;
}
@@ -1096,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
mutex_lock(&cgroup_root_mutex);
for_each_subsys(root, ss)
seq_printf(seq, ",%s", ss->name);
- if (test_bit(ROOT_NOPREFIX, &root->flags))
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
+ seq_puts(seq, ",sane_behavior");
+ if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
- if (test_bit(ROOT_XATTR, &root->flags))
+ if (root->flags & CGRP_ROOT_XATTR)
seq_puts(seq, ",xattr");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1160,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
all_ss = true;
continue;
}
+ if (!strcmp(token, "__DEVEL__sane_behavior")) {
+ opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
+ continue;
+ }
if (!strcmp(token, "noprefix")) {
- set_bit(ROOT_NOPREFIX, &opts->flags);
+ opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
@@ -1169,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
}
if (!strcmp(token, "xattr")) {
- set_bit(ROOT_XATTR, &opts->flags);
+ opts->flags |= CGRP_ROOT_XATTR;
continue;
}
if (!strncmp(token, "release_agent=", 14)) {
@@ -1247,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/* Consistency checks */
+ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
+
+ if (opts->flags & CGRP_ROOT_NOPREFIX) {
+ pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
+ return -EINVAL;
+ }
+
+ if (opts->cpuset_clone_children) {
+ pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
+ return -EINVAL;
+ }
+ }
+
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
- (opts->subsys_mask & mask))
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
@@ -1324,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
struct cgroup_sb_opts opts;
unsigned long added_mask, removed_mask;
+ if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("cgroup: sane_behavior: remount is not allowed\n");
+ return -EINVAL;
+ }
+
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
@@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
+ INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
@@ -1408,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
INIT_LIST_HEAD(&root->allcg_list);
root->number_of_cgroups = 1;
cgrp->root = root;
- cgrp->top_cgroup = cgrp;
+ cgrp->name = &root_cgroup_name;
init_cgroup_housekeeping(cgrp);
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
}
@@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
+ struct css_set *cg;
BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct hlist_head *hhead = &css_set_table[i];
- struct hlist_node *node;
- struct css_set *cg;
-
- hlist_for_each_entry(cg, node, hhead, hlist)
- link_css_set(&tmp_cg_links, cg, root_cgrp);
- }
+ hash_for_each(css_set_table, i, cg, hlist)
+ link_css_set(&tmp_cg_links, cg, root_cgrp);
write_unlock(&css_set_lock);
free_cg_links(&tmp_cg_links);
@@ -1677,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* any) is not needed
*/
cgroup_drop_root(opts.new_root);
+
+ if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
+ root->flags != opts.flags) {
+ pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto drop_new_super;
+ }
+
/* no subsys rebinding, so refcounts don't change */
drop_parsed_module_refcounts(opts.subsys_mask);
}
@@ -1761,49 +1777,48 @@ static struct kobject *cgroup_kobj;
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
- * Called with cgroup_mutex held or else with an RCU-protected cgroup
- * reference. Writes path of cgroup into buf. Returns 0 on success,
- * -errno on error.
+ * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
+ *
+ * We can't generate cgroup path using dentry->d_name, as accessing
+ * dentry->d_name must be protected by irq-unsafe dentry->d_lock or parent
+ * inode's i_mutex, while on the other hand cgroup_path() can be called
+ * with some irq-safe spinlocks held.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
- struct dentry *dentry = cgrp->dentry;
+ int ret = -ENAMETOOLONG;
char *start;
- rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
- "cgroup_path() called without proper locking");
-
- if (!dentry || cgrp == dummytop) {
- /*
- * Inactive subsystems have no dentry for their root
- * cgroup
- */
- strcpy(buf, "/");
+ if (!cgrp->parent) {
+ if (strlcpy(buf, "/", buflen) >= buflen)
+ return -ENAMETOOLONG;
return 0;
}
start = buf + buflen - 1;
-
*start = '\0';
- for (;;) {
- int len = dentry->d_name.len;
+ rcu_read_lock();
+ do {
+ const char *name = cgroup_name(cgrp);
+ int len;
+
+ len = strlen(name);
if ((start -= len) < buf)
- return -ENAMETOOLONG;
- memcpy(start, dentry->d_name.name, len);
- cgrp = cgrp->parent;
- if (!cgrp)
- break;
+ goto out;
+ memcpy(start, name, len);
- dentry = cgrp->dentry;
- if (!cgrp->parent)
- continue;
if (--start < buf)
- return -ENAMETOOLONG;
+ goto out;
*start = '/';
- }
+
+ cgrp = cgrp->parent;
+ } while (cgrp->parent);
+ ret = 0;
memmove(buf, start, buf + buflen - start);
- return 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path);
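/*
 * Illustrative sketch (not from the patch) of the build-backwards
 * technique the new cgroup_path() uses: components are written from the
 * tail of the buffer toward the front, then the finished string is
 * memmove()d to the start, avoiding a second length pass. build_path()
 * and its component array are hypothetical.
 */
#include <linux/errno.h>
#include <linux/string.h>

static int build_path(char *buf, int buflen, const char *const *comps, int n)
{
	char *start = buf + buflen - 1;

	*start = '\0';
	while (n--) {
		int len = strlen(comps[n]);

		if ((start -= len) < buf)
			return -ENAMETOOLONG;
		memcpy(start, comps[n], len);
		if (--start < buf)
			return -ENAMETOOLONG;
		*start = '/';
	}
	memmove(buf, start, buf + buflen - start);
	return 0;
}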
@@ -1892,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
*
* Must be called with cgroup_mutex and threadgroup locked.
*/
-static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+static void cgroup_task_migrate(struct cgroup *oldcgrp,
struct task_struct *tsk, struct css_set *newcg)
{
struct css_set *oldcg;
@@ -1925,122 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
}
/**
- * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
- * @cgrp: the cgroup the task is attaching to
- * @tsk: the task to be attached
- *
- * Call with cgroup_mutex and threadgroup locked. May take task_lock of
- * @tsk during call.
- */
-int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
- int retval = 0;
- struct cgroup_subsys *ss, *failed_ss = NULL;
- struct cgroup *oldcgrp;
- struct cgroupfs_root *root = cgrp->root;
- struct cgroup_taskset tset = { };
- struct css_set *newcg;
-
- /* @tsk either already exited or can't exit until the end */
- if (tsk->flags & PF_EXITING)
- return -ESRCH;
-
- /* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- return 0;
-
- tset.single.task = tsk;
- tset.single.cgrp = oldcgrp;
-
- for_each_subsys(root, ss) {
- if (ss->can_attach) {
- retval = ss->can_attach(cgrp, &tset);
- if (retval) {
- /*
- * Remember on which subsystem the can_attach()
- * failed, so that we only call cancel_attach()
- * against the subsystems whose can_attach()
- * succeeded. (See below)
- */
- failed_ss = ss;
- goto out;
- }
- }
- }
-
- newcg = find_css_set(tsk->cgroups, cgrp);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
-
- for_each_subsys(root, ss) {
- if (ss->attach)
- ss->attach(cgrp, &tset);
- }
-
- synchronize_rcu();
-out:
- if (retval) {
- for_each_subsys(root, ss) {
- if (ss == failed_ss)
- /*
- * This subsystem was the one that failed the
- * can_attach() check earlier, so we don't need
- * to call cancel_attach() against it or any
- * remaining subsystems.
- */
- break;
- if (ss->cancel_attach)
- ss->cancel_attach(cgrp, &tset);
- }
- }
- return retval;
-}
-
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
- struct cgroupfs_root *root;
- int retval = 0;
-
- cgroup_lock();
- for_each_active_root(root) {
- struct cgroup *from_cg = task_cgroup_from_root(from, root);
-
- retval = cgroup_attach_task(from_cg, tsk);
- if (retval)
- break;
- }
- cgroup_unlock();
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-/**
- * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
* @cgrp: the cgroup to attach to
- * @leader: the threadgroup leader task_struct of the group to be attached
+ * @tsk: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
*
* Call holding cgroup_mutex and the group_rwsem of the leader. Will take
- * task_lock of each thread in leader's threadgroup individually in turn.
+ * task_lock of @tsk or each thread in the threadgroup individually in turn.
*/
-static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
+ bool threadgroup)
{
int retval, i, group_size;
struct cgroup_subsys *ss, *failed_ss = NULL;
- /* guaranteed to be initialized later, but the compiler needs this */
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
- struct task_struct *tsk;
+ struct task_struct *leader = tsk;
struct task_and_cgroup *tc;
struct flex_array *group;
struct cgroup_taskset tset = { };
@@ -2052,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* group - group_rwsem prevents new threads from appearing, and if
* threads exit, this will just be an over-estimate.
*/
- group_size = get_nr_threads(leader);
+ if (threadgroup)
+ group_size = get_nr_threads(tsk);
+ else
+ group_size = 1;
/* flex_array supports very large thread-groups better than kmalloc. */
group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
if (retval)
goto out_free_group_list;
- tsk = leader;
i = 0;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2091,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
+
+ if (!threadgroup)
+ break;
} while_each_thread(leader, tsk);
rcu_read_unlock();
/* remember the number of threads in the array for later. */
@@ -2136,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
for (i = 0; i < group_size; i++) {
tc = flex_array_get(group, i);
- cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
+ cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
}
/* nothing is sensitive to fork() after this point. */
@@ -2151,7 +2071,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
/*
* step 5: success! and cleanup
*/
- synchronize_rcu();
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -2218,11 +2137,11 @@ retry_find_task:
tsk = tsk->group_leader;
/*
- * Workqueue threads may acquire PF_THREAD_BOUND and become
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
* trapped in a cpuset, or RT worker may be born in a cgroup
* with no rt_runtime allocated. Just say no.
*/
- if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
rcu_read_unlock();
goto out_unlock_cgroup;
@@ -2245,17 +2164,42 @@ retry_find_task:
put_task_struct(tsk);
goto retry_find_task;
}
- ret = cgroup_attach_proc(cgrp, tsk);
- } else
- ret = cgroup_attach_task(cgrp, tsk);
+ }
+
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
threadgroup_unlock(tsk);
put_task_struct(tsk);
out_unlock_cgroup:
- cgroup_unlock();
+ mutex_unlock(&cgroup_mutex);
return ret;
}
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroupfs_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ for_each_active_root(root) {
+ struct cgroup *from_cg = task_cgroup_from_root(from, root);
+
+ retval = cgroup_attach_task(from_cg, tsk, false);
+ if (retval)
+ break;
+ }
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
return attach_task_by_pid(cgrp, pid, false);
@@ -2266,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
return attach_task_by_pid(cgrp, tgid, true);
}
-/**
- * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
- * @cgrp: the cgroup to be checked for liveness
- *
- * On success, returns true; the lock should be later released with
- * cgroup_unlock(). On failure returns false with no lock held.
- */
-bool cgroup_lock_live_group(struct cgroup *cgrp)
-{
- mutex_lock(&cgroup_mutex);
- if (cgroup_is_removed(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- return false;
- }
- return true;
-}
-EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
-
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
@@ -2295,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
mutex_lock(&cgroup_root_mutex);
strcpy(cgrp->root->release_agent_path, buffer);
mutex_unlock(&cgroup_root_mutex);
- cgroup_unlock();
+ mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -2306,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
return -ENODEV;
seq_puts(seq, cgrp->root->release_agent_path);
seq_putc(seq, '\n');
- cgroup_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *seq)
+{
+ seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
return 0;
}
@@ -2531,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ int ret;
+ struct cgroup_name *name, *old_name;
+ struct cgroup *cgrp;
+
+ /*
+ * It's convenient to use the parent dir's i_mutex to protect
+ * cgrp->name.
+ */
+ lockdep_assert_held(&old_dir->i_mutex);
+
if (!S_ISDIR(old_dentry->d_inode->i_mode))
return -ENOTDIR;
if (new_dentry->d_inode)
return -EEXIST;
if (old_dir != new_dir)
return -EIO;
- return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+
+ cgrp = __d_cgrp(old_dentry);
+
+ name = cgroup_alloc_name(new_dentry);
+ if (!name)
+ return -ENOMEM;
+
+ ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ old_name = cgrp->name;
+ rcu_assign_pointer(cgrp->name, name);
+
+ kfree_rcu(old_name, rcu_head);
+ return 0;
}
static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
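/*
 * Hedged sketch of the RCU publish-then-free pattern cgroup_rename()
 * uses for cgrp->name: install the new object with rcu_assign_pointer()
 * and let kfree_rcu() free the old one after a grace period. struct
 * my_name and my_name_swap() are illustrative; readers dereference the
 * pointer under rcu_read_lock().
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_name {
	struct rcu_head rcu_head;
	char name[];
};

static struct my_name __rcu *cur_name;

static void my_name_swap(struct my_name *new)
{
	/* caller is assumed to hold the update-side lock */
	struct my_name *old = rcu_dereference_protected(cur_name, 1);

	rcu_assign_pointer(cur_name, new);
	kfree_rcu(old, rcu_head);	/* freed only after readers are done */
}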
@@ -2545,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
if (S_ISDIR(dentry->d_inode->i_mode))
return &__d_cgrp(dentry)->xattrs;
else
- return &__d_cft(dentry)->xattrs;
+ return &__d_cfe(dentry)->xattrs;
}
static inline int xattr_enabled(struct dentry *dentry)
{
struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
- return test_bit(ROOT_XATTR, &root->flags);
+ return root->flags & CGRP_ROOT_XATTR;
}
static bool is_valid_xattr(const char *name)
@@ -2637,7 +2597,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
*/
static inline struct cftype *__file_cft(struct file *file)
{
- if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+ if (file_inode(file)->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
@@ -2721,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
umode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
- simple_xattrs_init(&cft->xattrs);
-
- if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+ if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
strcpy(name, subsys->name);
strcat(name, ".");
}
@@ -2747,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
cfe->type = (void *)cft;
cfe->dentry = dentry;
dentry->d_fsdata = cfe;
+ simple_xattrs_init(&cfe->xattrs);
list_add_tail(&cfe->node, &parent->files);
cfe = NULL;
}
@@ -2764,19 +2723,21 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
continue;
- if (is_add)
+ if (is_add) {
err = cgroup_add_file(cgrp, subsys, cft);
- else
- err = cgroup_rm_file(cgrp, cft);
- if (err) {
- pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
- is_add ? "add" : "remove", cft->name, err);
+ if (err)
+ pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+ cft->name, err);
ret = err;
+ } else {
+ cgroup_rm_file(cgrp, cft);
}
}
return ret;
@@ -3017,6 +2978,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos. If there's no descendant,
+ * @pos is returned. This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+ struct cgroup *last, *tmp;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ last = pos;
+ /* ->prev isn't RCU safe, walk ->next till the end */
+ pos = NULL;
+ list_for_each_entry_rcu(tmp, &last->children, sibling)
+ pos = tmp;
+ } while (pos);
+
+ return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
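Because ->prev is not RCU-safe, the helper finds the rightmost descendant by walking each ->children list to its last entry, level by level. A self-contained userspace analogue of the same loop, with plain child arrays standing in for the RCU lists:

	#include <stdio.h>

	struct node {
		const char *name;
		struct node *child[4];	/* NULL-terminated children */
	};

	/* Keep descending into the last child until there is none. */
	static struct node *rightmost_descendant(struct node *pos)
	{
		struct node *last;

		do {
			last = pos;
			pos = NULL;
			for (int i = 0; i < 4 && last->child[i]; i++)
				pos = last->child[i];
		} while (pos);
		return last;
	}

	int main(void)
	{
		struct node c = { "c" }, b = { "b", { &c } }, a = { "a", { &b } };

		/* a pre-order walk resuming after "c" has skipped a's subtree */
		printf("%s\n", rightmost_descendant(&a)->name);	/* prints: c */
		return 0;
	}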
+
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
struct cgroup *last;
@@ -3268,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
return 0;
}
+static void cgroup_transfer_one_task(struct task_struct *task,
+ struct cgroup_scanner *scan)
+{
+ struct cgroup *new_cgroup = scan->data;
+
+ mutex_lock(&cgroup_mutex);
+ cgroup_attach_task(new_cgroup, task, false);
+ mutex_unlock(&cgroup_mutex);
+}
+
+/**
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ struct cgroup_scanner scan;
+
+ scan.cg = from;
+ scan.test_task = NULL; /* select all tasks in cgroup */
+ scan.process_task = cgroup_transfer_one_task;
+ scan.heap = NULL;
+ scan.data = to;
+
+ return cgroup_scan_tasks(&scan);
+}
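A caller only fills in the two cgroup pointers; a NULL test_task selects every task, and the scanner drives the per-task attach under cgroup_mutex. A hedged sketch of a hypothetical caller that evacuates a dying group into its parent:

	/* Hypothetical helper, not kernel API: empty @cgrp into its parent. */
	static int evacuate_to_parent(struct cgroup *cgrp)
	{
		if (!cgrp->parent)
			return -EINVAL;	/* the root has nowhere to go */
		return cgroup_transfer_tasks(cgrp->parent, cgrp);
	}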
+
/*
* Stuff for reading the 'tasks'/'procs' files.
*
@@ -3330,35 +3345,14 @@ static void pidlist_free(void *p)
else
kfree(p);
}
-static void *pidlist_resize(void *p, int newcount)
-{
- void *newlist;
- /* note: if new alloc fails, old p will still be valid either way */
- if (is_vmalloc_addr(p)) {
- newlist = vmalloc(newcount * sizeof(pid_t));
- if (!newlist)
- return NULL;
- memcpy(newlist, p, newcount * sizeof(pid_t));
- vfree(p);
- } else {
- newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
- }
- return newlist;
-}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * If the new stripped list is sufficiently smaller and there's enough memory
- * to allocate a new buffer, will let go of the unneeded memory. Returns the
- * number of unique elements.
+ * Returns the number of unique elements.
*/
-/* is the size difference enough that we should re-allocate the array? */
-#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
-static int pidlist_uniq(pid_t **p, int length)
+static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
- pid_t *list = *p;
- pid_t *newlist;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
@@ -3379,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)
dest++;
}
after:
- /*
- * if the length difference is large enough, we want to allocate a
- * smaller buffer to save memory. if this fails due to out of memory,
- * we'll just stay with what we've got.
- */
- if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
- newlist = pidlist_resize(list, dest);
- if (newlist)
- *p = newlist;
- }
return dest;
}
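Stripped of the reallocation logic, pidlist_uniq() is now the classic single-pass, in-place dedup of a sorted array. A runnable userspace equivalent of the same compaction:

	#include <stdio.h>

	/* Compact a sorted array in place; returns the unique count. */
	static int uniq(int *list, int length)
	{
		int src, dest = 1;

		if (length < 2)
			return length;
		for (src = 1; src < length; src++)
			if (list[src] != list[dest - 1])
				list[dest++] = list[src];
		return dest;
	}

	int main(void)
	{
		int pids[] = { 3, 3, 5, 9, 9, 9, 12 };
		int i, n = uniq(pids, 7);

		for (i = 0; i < n; i++)
			printf("%d ", pids[i]);	/* prints: 3 5 9 12 */
		printf("\n");
		return 0;
	}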
@@ -3484,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(&array, length);
+ length = pidlist_uniq(array, length);
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
pidlist_free(array);
@@ -3752,8 +3736,13 @@ static void cgroup_event_remove(struct work_struct *work)
remove);
struct cgroup *cgrp = event->cgrp;
+ remove_wait_queue(event->wqh, &event->wait);
+
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
eventfd_ctx_put(event->eventfd);
kfree(event);
dput(cgrp->dentry);
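Signaling the eventfd during teardown means a listener blocked on the event fd observes one final count bump when the cgroup goes away instead of sleeping forever. The primitive underneath is eventfd(2); a minimal runnable demo of its counter semantics:

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/eventfd.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val = 1;
		int efd = eventfd(0, 0);	/* counter starts at 0 */

		if (efd < 0)
			return 1;
		write(efd, &val, sizeof(val));	/* the in-kernel eventfd_signal(ctx, 1) does the same */
		read(efd, &val, sizeof(val));	/* returns and resets the counter */
		printf("event count: %llu\n", (unsigned long long)val);
		close(efd);
		return 0;
	}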
@@ -3773,15 +3762,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
- __remove_wait_queue(event->wqh, &event->wait);
- spin_lock(&cgrp->event_list_lock);
- list_del_init(&event->list);
- spin_unlock(&cgrp->event_list_lock);
/*
- * We are in atomic context, but cgroup_event_remove() may
- * sleep, so we have to call it in workqueue.
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will clean up
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
*/
- schedule_work(&event->remove);
+ spin_lock(&cgrp->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&cgrp->event_list_lock);
}
return 0;
@@ -3807,6 +3806,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
+ struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
@@ -3852,7 +3852,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
/* the process needs read permission on the control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
+ ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto fail;
@@ -3862,6 +3862,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
goto fail;
}
+ /*
+ * The file to be monitored must be in the same cgroup as
+ * cgroup.event_control is.
+ */
+ cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+ if (cgrp_cfile != cgrp) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
@@ -3872,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
if (ret)
goto fail;
- if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
- event->cft->unregister_event(cgrp, event->cft, event->eventfd);
- ret = 0;
- goto fail;
- }
+ efile->f_op->poll(efile, &event->pt);
/*
* Events should be removed after rmdir of cgroup directory, but before
@@ -3958,10 +3964,16 @@ static struct cftype files[] = {
},
{
.name = "cgroup.clone_children",
+ .flags = CFTYPE_INSANE,
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .read_seq_string = cgroup_sane_behavior_show,
+ },
+ {
.name = "release_agent",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
@@ -4073,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (!(css->flags & CSS_ONLINE))
return;
- /*
- * css_offline() should be called with cgroup_mutex unlocked. See
- * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
- * details. This temporary unlocking should go away once
- * cgroup_mutex is unexported from controllers.
- */
- if (ss->css_offline) {
- mutex_unlock(&cgroup_mutex);
+ if (ss->css_offline)
ss->css_offline(cgrp);
- mutex_lock(&cgroup_mutex);
- }
cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
}
@@ -4100,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
umode_t mode)
{
struct cgroup *cgrp;
+ struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
@@ -4110,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (!cgrp)
return -ENOMEM;
+ name = cgroup_alloc_name(dentry);
+ if (!name)
+ goto err_free_cgrp;
+ rcu_assign_pointer(cgrp->name, name);
+
cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
if (cgrp->id < 0)
- goto err_free_cgrp;
+ goto err_free_name;
/*
* Only live parents can have children. Note that the liveliness
@@ -4135,9 +4144,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
init_cgroup_housekeeping(cgrp);
+ dentry->d_fsdata = cgrp;
+ cgrp->dentry = dentry;
+
cgrp->parent = parent;
cgrp->root = parent->root;
- cgrp->top_cgroup = parent->top_cgroup;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4172,8 +4183,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
lockdep_assert_held(&dentry->d_inode->i_mutex);
/* allocation complete, commit to creation */
- dentry->d_fsdata = cgrp;
- cgrp->dentry = dentry;
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
@@ -4182,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
for_each_subsys(root, ss)
dget(dentry);
+ /* hold a ref to the parent's dentry */
+ dget(parent->dentry);
+
/* creation succeeded, notify subsystems */
for_each_subsys(root, ss) {
err = online_css(ss, cgrp);
@@ -4217,6 +4229,8 @@ err_free_all:
deactivate_super(sb);
err_free_id:
ida_simple_remove(&root->cgroup_ida, cgrp->id);
+err_free_name:
+ kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
kfree(cgrp);
return err;
@@ -4236,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
-/*
- * Check the reference count on each subsystem. Since we already
- * established that there are no tasks in the cgroup, if the css refcount
- * is also 1, then there should be no outstanding references, so the
- * subsystem is safe to destroy. We scan across all subsystems rather than
- * using the per-hierarchy linked list of mounted subsystems since we can
- * be called via check_for_release() with no synchronization other than
- * RCU, and the subsystem linked list isn't RCU-safe.
- */
-static int cgroup_has_css_refs(struct cgroup *cgrp)
-{
- int i;
-
- /*
- * We won't need to lock the subsys array, because the subsystems
- * we're concerned about aren't going anywhere since our cgroup root
- * has a reference on them.
- */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- struct cgroup_subsys_state *css;
-
- /* Skip subsystems not present or not in this hierarchy */
- if (ss == NULL || ss->root != cgrp->root)
- continue;
-
- css = cgrp->subsys[ss->subsys_id];
- /*
- * When called from check_for_release() it's possible
- * that by this point the cgroup has been removed
- * and the css deleted. But a false-positive doesn't
- * matter, since it can only happen if the cgroup
- * has been deleted and hence no longer needs the
- * release agent to be called anyway.
- */
- if (css && css_refcnt(css) > 1)
- return 1;
- }
- return 0;
-}
-
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
struct cgroup *parent = cgrp->parent;
- DEFINE_WAIT(wait);
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
- LIST_HEAD(tmp_list);
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
@@ -4340,20 +4311,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
- * directory to avoid race between userspace and kernelspace. Use
- * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
- * cgroup_event_wake() is called with the wait queue head locked,
- * remove_wait_queue() cannot be called while holding event_list_lock.
+ * directory to avoid a race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
- list_splice_init(&cgrp->event_list, &tmp_list);
- spin_unlock(&cgrp->event_list_lock);
- list_for_each_entry_safe(event, tmp, &tmp_list, list) {
+ list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
- remove_wait_queue(event->wqh, &event->wait);
- eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
+ spin_unlock(&cgrp->event_list_lock);
return 0;
}
@@ -4415,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- ss->active = 1;
BUG_ON(online_css(ss, dummytop));
mutex_unlock(&cgroup_mutex);
@@ -4438,6 +4402,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
int i, ret;
+ struct hlist_node *tmp;
+ struct css_set *cg;
+ unsigned long key;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,27 +4470,20 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
- struct css_set *cg;
- struct hlist_node *node, *tmp;
- struct hlist_head *bucket = &css_set_table[i], *new_bucket;
-
- hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
- /* skip entries that we already rehashed */
- if (cg->subsys[ss->subsys_id])
- continue;
- /* remove existing entry */
- hlist_del(&cg->hlist);
- /* set new value */
- cg->subsys[ss->subsys_id] = css;
- /* recompute hash and restore entry */
- new_bucket = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, new_bucket);
- }
+ hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
+ /* skip entries that we already rehashed */
+ if (cg->subsys[ss->subsys_id])
+ continue;
+ /* remove existing entry */
+ hash_del(&cg->hlist);
+ /* set new value */
+ cg->subsys[ss->subsys_id] = css;
+ /* recompute hash and restore entry */
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
- ss->active = 1;
ret = online_css(ss, dummytop);
if (ret)
goto err_unload;
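The open-coded bucket scan becomes the generic <linux/hashtable.h> helpers: the table size is baked into the DEFINE_HASHTABLE() declaration, and hash_add()/hash_del() take the computed key rather than a bucket pointer. A hedged sketch of the same delete-update-reinsert move on a toy table (names illustrative):

	#include <linux/hashtable.h>

	struct item {
		unsigned long key;
		struct hlist_node hlist;
	};

	static DEFINE_HASHTABLE(toy_table, 7);	/* 2^7 buckets */

	/* When an object's key changes it must be unhashed and rehashed,
	 * exactly as the css_set rehash above does. */
	static void rekey(struct item *it, unsigned long new_key)
	{
		hash_del(&it->hlist);		/* unlink from the old bucket */
		it->key = new_key;
		hash_add(toy_table, &it->hlist, it->key);
	}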
@@ -4551,7 +4511,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
- struct hlist_head *hhead;
BUG_ON(ss->module == NULL);
@@ -4565,12 +4524,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
mutex_lock(&cgroup_mutex);
offline_css(ss, dummytop);
- ss->active = 0;
- if (ss->use_id) {
- idr_remove_all(&ss->idr);
+ if (ss->use_id)
idr_destroy(&ss->idr);
- }
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
@@ -4585,11 +4541,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
+ unsigned long key;
- hlist_del(&cg->hlist);
+ hash_del(&cg->hlist);
cg->subsys[ss->subsys_id] = NULL;
- hhead = css_set_hash(cg->subsys);
- hlist_add_head(&cg->hlist, hhead);
+ key = css_set_hash(cg->subsys);
+ hash_add(css_set_table, &cg->hlist, key);
}
write_unlock(&css_set_lock);
@@ -4631,9 +4588,6 @@ int __init cgroup_init_early(void)
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
- for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
- INIT_HLIST_HEAD(&css_set_table[i]);
-
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
@@ -4667,7 +4621,7 @@ int __init cgroup_init(void)
{
int err;
int i;
- struct hlist_head *hhead;
+ unsigned long key;
err = bdi_init(&cgroup_backing_dev_info);
if (err)
@@ -4686,8 +4640,8 @@ int __init cgroup_init(void)
}
/* Add init_css_set to the hash table */
- hhead = css_set_hash(init_css_set.subsys);
- hlist_add_head(&init_css_set.hlist, hhead);
+ key = css_set_hash(init_css_set.subsys);
+ hash_add(css_set_table, &init_css_set.hlist, key);
BUG_ON(!init_root_id(&rootnode));
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4724,7 +4678,7 @@ out:
*/
/* TODO: Use a proper seq_file iterator */
-static int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
@@ -4776,19 +4730,6 @@ out:
return retval;
}
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-const struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
@@ -4890,17 +4831,17 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
if (need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * fork/exit callbacks are supported only for builtin
+ * subsystems, and the builtin section of the subsys
+ * array is immutable, so we don't need to lock the
+ * subsys array here. On the other hand, modular section
+ * of the array can be freed at module unload, so we
+ * can't touch that.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- /*
- * fork/exit callbacks are supported only for
- * builtin subsystems and we don't need further
- * synchronization as they never go away.
- */
- if (!ss || ss->module)
- continue;
-
if (ss->fork)
ss->fork(child);
}
@@ -4965,13 +4906,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
tsk->cgroups = &init_css_set;
if (run_callbacks && need_forkexit_callback) {
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ /*
+ * fork/exit callbacks are supported only for builtin
+ * subsystems, see cgroup_post_fork() for details.
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- /* modular subsystems can't use callbacks */
- if (!ss || ss->module)
- continue;
-
if (ss->exit) {
struct cgroup *old_cgrp =
rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -4982,48 +4923,22 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
- * @cgrp: the cgroup in question
- * @task: the task in question
- *
- * See if @cgrp is a descendant of @task's cgroup in the appropriate
- * hierarchy.
- *
- * If we are sending in dummytop, then presumably we are creating
- * the top cgroup in the subsystem.
- *
- * Called only by the ns (nsproxy) cgroup.
- */
-int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
-{
- int ret;
- struct cgroup *target;
-
- if (cgrp == dummytop)
- return 1;
-
- target = task_cgroup_from_root(task, cgrp->root);
- while (cgrp != target && cgrp!= cgrp->top_cgroup)
- cgrp = cgrp->parent;
- ret = (cgrp == target);
- return ret;
+ put_css_set_taskexit(cg);
}
static void check_for_release(struct cgroup *cgrp)
{
/* All of these checks rely on RCU to keep the cgroup
* structure alive */
- if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
- && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
- /* Control Group is currently removeable. If it's not
+ if (cgroup_is_releasable(cgrp) &&
+ !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+ /*
+ * Control Group is currently removable. If it's not
* already queued for a userspace notification, queue
- * it now */
+ * it now
+ */
int need_schedule_work = 0;
+
raw_spin_lock(&release_list_lock);
if (!cgroup_is_removed(cgrp) &&
list_empty(&cgrp->release_list)) {
@@ -5056,24 +4971,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = css->cgroup;
int v;
- rcu_read_lock();
v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-
- switch (v) {
- case 1:
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
- break;
- case 0:
+ if (v == 0)
schedule_work(&css->dput_work);
- break;
- }
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(__css_put);
@@ -5274,7 +5176,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
- int myid, error, size;
+ int ret, size;
BUG_ON(!ss->use_id);
@@ -5282,35 +5184,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
- /* get id */
- if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
- error = -ENOMEM;
- goto err_out;
- }
+
+ idr_preload(GFP_KERNEL);
spin_lock(&ss->id_lock);
/* Don't use 0; allocate an ID in the range 1-65535. */
- error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+ ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
spin_unlock(&ss->id_lock);
+ idr_preload_end();
/* Returns an error when there is no free space for a new ID. */
- if (error) {
- error = -ENOSPC;
+ if (ret < 0)
goto err_out;
- }
- if (myid > CSS_ID_MAX)
- goto remove_idr;
- newid->id = myid;
+ newid->id = ret;
newid->depth = depth;
return newid;
-remove_idr:
- error = -ENOSPC;
- spin_lock(&ss->id_lock);
- idr_remove(&ss->idr, myid);
- spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
- return ERR_PTR(error);
+ return ERR_PTR(ret);
}
@@ -5383,55 +5274,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
}
EXPORT_SYMBOL_GPL(css_lookup);
-/**
- * css_get_next - lookup next cgroup under specified hierarchy.
- * @ss: pointer to subsystem
- * @id: current position of iteration.
- * @root: pointer to css. search tree under this.
- * @foundid: position of found object.
- *
- * Search next css under the specified hierarchy of rootid. Calling under
- * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
- */
-struct cgroup_subsys_state *
-css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid)
-{
- struct cgroup_subsys_state *ret = NULL;
- struct css_id *tmp;
- int tmpid;
- int rootid = css_id(root);
- int depth = css_depth(root);
-
- if (!rootid)
- return NULL;
-
- BUG_ON(!ss->use_id);
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* fill start point for scan */
- tmpid = id;
- while (1) {
- /*
- * scan next entry from bitmap(tree), tmpid is updated after
- * idr_get_next().
- */
- tmp = idr_get_next(&ss->idr, &tmpid);
- if (!tmp)
- break;
- if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
- ret = rcu_dereference(tmp->css);
- if (ret) {
- *foundid = tmpid;
- break;
- }
- }
- /* continue to scan from next id */
- tmpid = tmpid + 1;
- }
- return ret;
-}
-
/*
* get corresponding css from file open on cgroupfs directory
*/
@@ -5441,7 +5283,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
struct inode *inode;
struct cgroup_subsys_state *css;
- inode = f->f_dentry->d_inode;
+ inode = file_inode(f);
/* check in cgroup filesystem dir */
if (inode->i_op != &cgroup_dir_inode_operations)
return ERR_PTR(-EBADF);
diff --git a/kernel/compat.c b/kernel/compat.c
index 36700e9e2be9..0a09e481b70b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
__put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
}
-asmlinkage long compat_sys_getitimer(int which,
- struct compat_itimerval __user *it)
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+ struct compat_itimerval __user *, it)
{
struct itimerval kit;
int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
return error;
}
-asmlinkage long compat_sys_setitimer(int which,
- struct compat_itimerval __user *in,
- struct compat_itimerval __user *out)
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+ struct compat_itimerval __user *, in,
+ struct compat_itimerval __user *, out)
{
struct itimerval kin, kout;
int error;
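The conversions in this file are mechanical: each open-coded asmlinkage prototype becomes a COMPAT_SYSCALL_DEFINEn() invocation taking (type, name) pairs, mirroring SYSCALL_DEFINEn(), while the body stays untouched; the macro then generates the wrapper so compat arguments are passed consistently on 64-bit kernels. A sketch with a made-up syscall:

	/* "frob" is a hypothetical example, not a real syscall. */
	COMPAT_SYSCALL_DEFINE2(frob, int, which,
			       compat_ulong_t __user *, p)
	{
		/* body unchanged from the old asmlinkage form */
		return 0;
	}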
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
memcpy(blocked->sig, &set, sizeof(set));
}
-asmlinkage long compat_sys_sigprocmask(int how,
- compat_old_sigset_t __user *nset,
- compat_old_sigset_t __user *oset)
+COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
+ compat_old_sigset_t __user *, nset,
+ compat_old_sigset_t __user *, oset)
{
old_sigset_t old_set, new_set;
sigset_t new_blocked;
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
-{
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
-}
-
COMPAT_SYSCALL_DEFINE4(wait4,
compat_pid_t, pid,
compat_uint_t __user *, stat_addr,
@@ -593,7 +574,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
else
ret = put_compat_rusage(&ru, uru);
if (ret)
- return ret;
+ return -EFAULT;
}
BUG_ON(info.si_code & __SI_MASK);
@@ -971,7 +952,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
}
void
-sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
+sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -982,10 +963,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
}
EXPORT_SYMBOL_GPL(sigset_from_compat);
-asmlinkage long
-compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
- struct compat_siginfo __user *uinfo,
- struct compat_timespec __user *uts, compat_size_t sigsetsize)
+void
+sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+{
+ switch (_NSIG_WORDS) {
+ case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
+ case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
+ case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
+ case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+ }
+}
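sigset_to_compat() is the exact mirror of sigset_from_compat(): each 64-bit signal word splits into two 32-bit compat words, low half first, and the deliberate switch fall-through copies every lower word as well. A runnable round-trip check of the word splitting:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t set = 0x123456789abcdef0ull;
		uint32_t lo = (uint32_t)set;			/* compat sig[0] */
		uint32_t hi = set >> 32;			/* compat sig[1] */
		uint64_t back = (uint64_t)hi << 32 | lo;	/* the inverse */

		assert(back == set);
		printf("round trip ok: %#llx\n", (unsigned long long)back);
		return 0;
	}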
+
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
{
compat_sigset_t s32;
sigset_t s;
@@ -1013,18 +1004,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
}
return ret;
-
-}
-
-asmlinkage long
-compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
- struct compat_siginfo __user *uinfo)
-{
- siginfo_t info;
-
- if (copy_siginfo_from_user32(&info, uinfo))
- return -EFAULT;
- return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1067,23 +1046,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
-asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
-{
- sigset_t newset;
- compat_sigset_t newset32;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&newset, &newset32);
- return sigsuspend(&newset);
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
-
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
{
struct timex txc;
@@ -1157,74 +1119,9 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
-{
- struct sysinfo s;
-
- do_sysinfo(&s);
-
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
-
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
-
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
-
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
- return -EFAULT;
-
- return 0;
-}
-
-#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL
-asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
- struct compat_timespec __user *interval)
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+ compat_pid_t, pid,
+ struct compat_timespec __user *, interval)
{
struct timespec t;
int ret;
@@ -1237,7 +1134,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
return -EFAULT;
return ret;
}
-#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */
/*
* Allocate user-space memory for the duration of a single system call,
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075eed..c18b1f1ae515 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- entry->size = kernel_config_data_size;
+ proc_set_size(entry, kernel_config_data_size);
return 0;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd55508..65349f07b878 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscall and exception entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ * Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
+
#include <linux/context_tracking.h>
+#include <linux/kvm_host.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
-#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/export.h>
-struct context_tracking {
- /*
- * When active is false, hooks are not set to
- * minimize overhead: TIF flags are cleared
- * and calls to user_enter/exit are ignored. This
- * may be further optimized using static keys.
- */
- bool active;
- enum {
- IN_KERNEL = 0,
- IN_USER,
- } state;
-};
-
-static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
.active = true,
#endif
};
+/**
+ * user_enter - Inform the context tracking that the CPU is going to
+ * enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed that the remaining kernel
+ * instructions to execute won't use any RCU read-side critical section,
+ * because this function puts RCU in an extended quiescent state.
+ */
void user_enter(void)
{
unsigned long flags;
@@ -39,40 +51,90 @@ void user_enter(void)
if (in_interrupt())
return;
+ /* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
local_irq_save(flags);
if (__this_cpu_read(context_tracking.active) &&
__this_cpu_read(context_tracking.state) != IN_USER) {
- __this_cpu_write(context_tracking.state, IN_USER);
+ /*
+ * At this stage, only low level arch entry code remains and
+ * then we'll run in userspace. We can assume there won't be
+ * any RCU read-side critical section until the next call to
+ * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+ * on the tick.
+ */
+ vtime_user_enter(current);
rcu_user_enter();
+ __this_cpu_write(context_tracking.state, IN_USER);
}
local_irq_restore(flags);
}
+
+/**
+ * user_exit - Inform the context tracking that the CPU is
+ * exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we enter the kernel from userspace,
+ * before any use of an RCU read-side critical section. This potentially
+ * includes any high-level kernel code such as syscalls, exceptions and
+ * signal handling.
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
void user_exit(void)
{
unsigned long flags;
- /*
- * Some contexts may involve an exception occuring in an irq,
- * leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
- * This would mess up the dyntick_nesting count though. And rcu_irq_*()
- * helpers are enough to protect RCU uses inside the exception. So
- * just return immediately if we detect we are in an IRQ.
- */
if (in_interrupt())
return;
local_irq_save(flags);
if (__this_cpu_read(context_tracking.state) == IN_USER) {
- __this_cpu_write(context_tracking.state, IN_KERNEL);
+ /*
+ * We are going to run code that may use RCU. Inform
+ * RCU core about that (ie: we may need the tick again).
+ */
rcu_user_exit();
+ vtime_user_exit(current);
+ __this_cpu_write(context_tracking.state, IN_KERNEL);
}
local_irq_restore(flags);
}
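Both hooks guard on the per-CPU state, so they are idempotent: a nested exception that reaches user_exit() twice performs the transition only once. A runnable single-threaded analogue of that guard (IRQ masking and the RCU/vtime calls elided):

	#include <stdio.h>

	enum ct_state { IN_KERNEL = 0, IN_USER };

	static enum ct_state state;	/* per-CPU in the real code */

	static void enter_user(void)
	{
		if (state != IN_USER) {	/* transition at most once */
			/* vtime_user_enter(); rcu_user_enter(); in the kernel */
			state = IN_USER;
		}
	}

	static void exit_user(void)
	{
		if (state == IN_USER) {
			/* rcu_user_exit(); vtime_user_exit(); in the kernel */
			state = IN_KERNEL;
		}
	}

	int main(void)
	{
		enter_user();
		exit_user();
		exit_user();	/* re-entrant call is a harmless no-op */
		printf("state=%d\n", state);	/* 0 == IN_KERNEL */
		return 0;
	}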
+void guest_enter(void)
+{
+ if (vtime_accounting_enabled())
+ vtime_guest_enter(current);
+ else
+ __guest_enter();
+}
+EXPORT_SYMBOL_GPL(guest_enter);
+
+void guest_exit(void)
+{
+ if (vtime_accounting_enabled())
+ vtime_guest_exit(current);
+ else
+ __guest_exit();
+}
+EXPORT_SYMBOL_GPL(guest_exit);
+
+
+/**
+ * context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * Context tracking uses the syscall slow path to implement its user-kernel
+ * boundary probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such, the
+ * TIF flag may not be desired there.
+ */
void context_tracking_task_switch(struct task_struct *prev,
struct task_struct *next)
{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242c..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
static inline void check_for_tasks(int cpu)
{
struct task_struct *p;
+ cputime_t utime, stime;
write_lock_irq(&tasklist_lock);
for_each_process(p) {
+ task_cputime(p, &utime, &stime);
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (p->utime || p->stime))
+ (utime || stime))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
"(state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ /* Park the stopper thread */
+ kthread_park(current);
return 0;
}
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
new file mode 100644
index 000000000000..59ab052ef7a0
--- /dev/null
+++ b/kernel/cpu/Makefile
@@ -0,0 +1 @@
+obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
new file mode 100644
index 000000000000..8b86c0c68edf
--- /dev/null
+++ b/kernel/cpu/idle.c
@@ -0,0 +1,116 @@
+/*
+ * Generic entry point for the idle threads
+ */
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+
+#include <asm/tlb.h>
+
+#include <trace/events/power.h>
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static inline int cpu_idle_poll(void)
+{
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ while (!need_resched())
+ cpu_relax();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ return 1;
+}
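cpu_idle_poll() simply spins on need_resched() with interrupts enabled, the fallback used when polling is forced (e.g. "nohlt") or the tick broadcast device has already fired. A userspace skeleton of the same spin-until-flag wait, with a thread standing in for the wakeup interrupt:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	static atomic_bool resched_pending;

	static void *waker(void *arg)
	{
		usleep(1000);	/* pretend an interrupt arrives later */
		atomic_store(&resched_pending, true);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, waker, NULL);
		while (!atomic_load(&resched_pending))
			;	/* cpu_relax() would sit here in the kernel */
		pthread_join(t, NULL);
		puts("woke up");
		return 0;
	}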
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+}
+
+/*
+ * Generic idle loop implementation
+ */
+static void cpu_idle_loop(void)
+{
+ while (1) {
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id()))
+ arch_cpu_idle_dead();
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin.
+ *
+ * Also if we detected in the wakeup from idle
+ * path that the tick broadcast device expired
+ * for us, we don't want to go deep idle as we
+ * know that the IPI is going to arrive right
+ * away.
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ cpu_idle_poll();
+ } else {
+ current_clr_polling();
+ if (!need_resched()) {
+ stop_critical_timings();
+ rcu_idle_enter();
+ arch_cpu_idle();
+ WARN_ON_ONCE(irqs_disabled());
+ rcu_idle_exit();
+ start_critical_timings();
+ } else {
+ local_irq_enable();
+ }
+ current_set_polling();
+ }
+ arch_cpu_idle_exit();
+ }
+ tick_nohz_idle_exit();
+ schedule_preempt_disabled();
+ }
+}
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ current_set_polling();
+ arch_cpu_idle_prepare();
+ cpu_idle_loop();
+}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..64b3f791bbe5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
#include <linux/cgroup.h>
/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
-/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
- struct cpuset *parent; /* my parent */
-
struct fmeter fmeter; /* memory_pressure filter */
+ /*
+ * Tasks are being attached to this cpuset. Used to prevent
+ * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+ */
+ int attach_in_progress;
+
/* partition number for rebuild_sched_domains() */
int pn;
/* for custom sched domain */
int relax_domain_level;
- /* used for walking a cpuset hierarchy */
- struct list_head stack_list;
+ struct work_struct hotplug_work;
};
/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+ struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+ if (pcgrp)
+ return cgroup_cs(pcgrp);
+ return NULL;
+}
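With the cached ->parent pointer removed, every upward walk now derives the parent from the cgroup linkage via parent_cs(); the typical pattern is climbing until a predicate holds, as guarantee_online_cpus() does below. A tiny runnable analogue of climb-until-intersect:

	#include <stdio.h>

	struct cs {
		struct cs *parent;
		unsigned int mask;	/* stand-in for cpus_allowed */
	};

	/* Climb toward the root until some bits survive the intersection. */
	static struct cs *first_usable(struct cs *c, unsigned int online)
	{
		while (c && !(c->mask & online))
			c = c->parent;
		return c;
	}

	int main(void)
	{
		struct cs root = { NULL, 0xf }, child = { &root, 0x0 };

		/* child has no usable bits, so the walk falls back to root */
		printf("mask=%#x\n", first_usable(&child, 0x3)->mask);
		return 0;
	}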
+
#ifdef CONFIG_NUMA
static inline bool task_has_mempolicy(struct task_struct *task)
{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
/* bits in struct cpuset flags field */
typedef enum {
+ CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
-/* the type of hotplug event */
-enum hotplug_event {
- CPUSET_CPU_OFFLINE,
- CPUSET_MEM_OFFLINE,
-};
-
/* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+ return test_bit(CS_ONLINE, &cs->flags);
+}
+
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+ .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+ (1 << CS_MEM_EXCLUSIVE)),
};
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs. Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
+ cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
+ if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk the descendants of
+ *
+ * Walk @des_cs through the online descendants of @root_cs. Must be used
+ * with RCU read locked. The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip a subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
+ cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+ if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
/*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,18 +261,19 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
+static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
/*
- * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
- * buffers. They are statically allocated to prevent using excess stack
- * when calling cpuset_print_task_mems_allowed().
+ * CPU / memory hotplug is handled asynchronously.
*/
-#define CPUSET_NAME_LEN (128)
-#define CPUSET_NODELIST_LEN (256)
-static char cpuset_name[CPUSET_NAME_LEN];
-static char cpuset_nodelist[CPUSET_NODELIST_LEN];
-static DEFINE_SPINLOCK(cpuset_buffer_lock);
+static struct workqueue_struct *cpuset_propagate_hotplug_wq;
+
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
+
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
/*
* This is ugly, but preserves the userspace API for existing cpuset
@@ -289,7 +319,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else
@@ -314,7 +344,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY]))
- cs = cs->parent;
+ cs = parent_cs(cs);
if (cs)
nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]);
@@ -326,7 +356,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -346,7 +376,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +425,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -412,48 +442,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
struct cgroup *cont;
struct cpuset *c, *par;
+ int ret;
+
+ rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- if (!is_cpuset_subset(cgroup_cs(cont), trial))
- return -EBUSY;
- }
+ ret = -EBUSY;
+ cpuset_for_each_child(c, cont, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
/* Remaining checks don't apply to root cpuset */
+ ret = 0;
if (cur == &top_cpuset)
- return 0;
+ goto out;
- par = cur->parent;
+ par = parent_cs(cur);
/* We must be a subset of our parent cpuset */
+ ret = -EACCES;
if (!is_cpuset_subset(trial, par))
- return -EACCES;
+ goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
*/
- list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
+ ret = -EINVAL;
+ cpuset_for_each_child(c, cont, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- return -EINVAL;
+ goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
- return -EINVAL;
+ goto out;
}
- /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
- if (cgroup_task_count(cur->css.cgroup)) {
- if (cpumask_empty(trial->cpus_allowed) ||
- nodes_empty(trial->mems_allowed)) {
- return -ENOSPC;
- }
- }
+ /*
+ * Cpusets with tasks - existing or newly being attached - can't
+ * have empty cpus_allowed or mems_allowed.
+ */
+ ret = -ENOSPC;
+ if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
+ (cpumask_empty(trial->cpus_allowed) ||
+ nodes_empty(trial->mems_allowed)))
+ goto out;
- return 0;
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SMP
@@ -474,31 +514,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return;
}
-static void
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
+ struct cpuset *root_cs)
{
- LIST_HEAD(q);
-
- list_add(&c->stack_list, &q);
- while (!list_empty(&q)) {
- struct cpuset *cp;
- struct cgroup *cont;
- struct cpuset *child;
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
- if (cpumask_empty(cp->cpus_allowed))
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp doesn't have any CPU */
+ if (cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue;
+ }
if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp);
-
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
}
+ rcu_read_unlock();
}
/*
@@ -520,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +591,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +599,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
+ struct cgroup *pos_cgrp;
doms = NULL;
dattr = NULL;
@@ -594,33 +627,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done;
csn = 0;
- list_add(&top_cpuset.stack_list, &q);
- while (!list_empty(&q)) {
- struct cgroup *cont;
- struct cpuset *child; /* scans child cpusets of cp */
-
- cp = list_first_entry(&q, struct cpuset, stack_list);
- list_del(q.next);
-
- if (cpumask_empty(cp->cpus_allowed))
- continue;
-
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
/*
- * All child cpusets contain a subset of the parent's cpus, so
- * just skip them, and then we call update_domain_attr_tree()
- * to calc relax_domain_level of the corresponding sched
- * domain.
+ * Continue traversing beyond @cp iff @cp has some CPUs and
+ * isn't load balancing. The former is obvious. The
+ * latter: All child cpusets contain a subset of the
+ * parent's cpus, so just skip them, and then we call
+ * update_domain_attr_tree() to calc relax_domain_level of
+ * the corresponding sched domain.
*/
- if (is_sched_load_balance(cp)) {
- csa[csn++] = cp;
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ !is_sched_load_balance(cp))
continue;
- }
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &q);
- }
- }
+ if (is_sched_load_balance(cp))
+ csa[csn++] = cp;
+
+ /* skip @cp's subtree */
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ }
+ rcu_read_unlock();
for (i = 0; i < csn; i++)
csa[i]->pn = i;
@@ -725,82 +752,50 @@ done:
/*
* Rebuild scheduler domains.
*
- * Call with neither cgroup_mutex held nor within get_online_cpus().
- * Takes both cgroup_mutex and get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
*
- * Cannot be directly called from cpuset code handling changes
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
+ * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ lockdep_assert_held(&cpuset_mutex);
get_online_cpus();
+ /*
+ * We have raced with CPU hotplug. Don't do anything, to avoid
+ * passing doms with an offlined cpu to partition_sched_domains().
+ * The hotplug work item will rebuild the sched domains anyway.
+ */
+ if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ goto out;
+
/* Generate domain masks and attrs */
- cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-
+out:
put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
-{
-}
-
-static int generate_sched_domains(cpumask_var_t **domains,
- struct sched_domain_attr **attributes)
+static void rebuild_sched_domains_locked(void)
{
- *domains = NULL;
- return 1;
}
#endif /* CONFIG_SMP */
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
-{
- queue_work(cpuset_wq, &rebuild_sched_domains_work);
-}
-
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
void rebuild_sched_domains(void)
{
- do_rebuild_sched_domains(NULL);
+ mutex_lock(&cpuset_mutex);
+ rebuild_sched_domains_locked();
+ mutex_unlock(&cpuset_mutex);
}
/**
@@ -808,7 +803,7 @@ void rebuild_sched_domains(void)
* @tsk: task to test
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Called for each task in a cgroup by cgroup_scan_tasks().
* Return nonzero if this task's cpus_allowed mask should be changed (in other
* words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +824,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
* cpus_allowed mask needs to be changed.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -842,7 +837,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -920,7 +915,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
heap_free(&heap);
if (is_load_balanced)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
return 0;
}
@@ -932,7 +927,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 * Temporarily set tasks' mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cgroup_mutex, so current's cpuset won't change
+ * Call holding cpuset_mutex, so current's cpuset won't change
 * during this call, as cpuset_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1002,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
/*
* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
* of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -1016,7 +1011,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cgroup_mutex */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1038,7 @@ static void *cpuset_being_rebound;
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
@@ -1065,7 +1060,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1079,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vmas and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1163,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
}
return 0;
@@ -1182,7 +1177,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* Called by cgroup_scan_tasks() for each task in a cgroup.
*
* We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
*/
static void cpuset_change_flag(struct task_struct *tsk,
struct cgroup_scanner *scan)
@@ -1195,7 +1190,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
* @cs: the cpuset in which each task's spread flags needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
@@ -1220,7 +1215,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1255,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- async_rebuild_sched_domains();
+ rebuild_sched_domains_locked();
if (spread_flag_changed)
update_tasks_flags(cs, &heap);
@@ -1368,54 +1363,68 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/*
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
struct cpuset *cs = cgroup_cs(cgrp);
struct task_struct *task;
int ret;
+ mutex_lock(&cpuset_mutex);
+
+ ret = -ENOSPC;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
+ goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) {
/*
- * Kthreads bound to specific cpus cannot be moved to a new
- * cpuset; we cannot change their cpu affinity and
- * isolating such threads by their set of allowed nodes is
- * unnecessary. Thus, cpusets are not applicable for such
- * threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before
- * cpus_allowed may be changed.
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
*/
- if (task->flags & PF_THREAD_BOUND)
- return -EINVAL;
- if ((ret = security_task_setscheduler(task)))
- return ret;
+ ret = -EINVAL;
+ if (task->flags & PF_NO_SETAFFINITY)
+ goto out_unlock;
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
}
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ /*
+	 * Mark that attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return ret;
+}
- return 0;
+static void cpuset_cancel_attach(struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
+{
+ mutex_lock(&cpuset_mutex);
+ cgroup_cs(cgrp)->attach_in_progress--;
+ mutex_unlock(&cpuset_mutex);
}
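
can_attach() now reserves the cpuset by bumping attach_in_progress, and cancel_attach()/attach() drop the reservation; while it is held, validate_change() refuses configuration changes that would empty the cpuset. A sketch of that reserve/commit/cancel shape in userspace C (hypothetical struct, with a plain mutex in place of cpuset_mutex):

#include <pthread.h>

struct cs { int attach_in_progress; };

static pthread_mutex_t cs_lock = PTHREAD_MUTEX_INITIALIZER;

/* Phase 1: validate and reserve. */
static int can_attach(struct cs *cs)
{
	pthread_mutex_lock(&cs_lock);
	cs->attach_in_progress++;	/* blocks emptying config changes */
	pthread_mutex_unlock(&cs_lock);
	return 0;
}

/* Phase 2a: something else in the migration failed; undo. */
static void cancel_attach(struct cs *cs)
{
	pthread_mutex_lock(&cs_lock);
	cs->attach_in_progress--;
	pthread_mutex_unlock(&cs_lock);
}

/* Phase 2b: commit; the real attach() drops the reservation at the end. */
static void attach(struct cs *cs)
{
	pthread_mutex_lock(&cs_lock);
	cs->attach_in_progress--;
	pthread_mutex_unlock(&cs_lock);
}

int main(void)
{
	struct cs c = { 0 };

	if (can_attach(&c) == 0)
		attach(&c);	/* or cancel_attach(&c) on failure */
	return 0;
}
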
+/*
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there. Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
+
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
+ /* static bufs protected by cpuset_mutex */
+ static nodemask_t cpuset_attach_nodemask_from;
+ static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1432,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ mutex_lock(&cpuset_mutex);
+
+ /* prepare for attach */
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+
cgroup_taskset_for_each(task, cgrp, tset) {
/*
* can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1467,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
&cpuset_attach_nodemask_to);
mmput(mm);
}
+
+ cs->attach_in_progress--;
+
+ /*
+ * We may have raced with CPU/memory hotunplug. Trigger hotplug
+ * propagation if @cs doesn't have any CPU or memory. It will move
+ * the newly added tasks to the nearest parent which can execute.
+ */
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ schedule_cpuset_propagate_hotplug(cs);
+
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1500,13 @@ typedef enum {
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1540,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1563,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL;
break;
}
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1539,17 +1574,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf)
{
- int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs;
+ int retval = -ENODEV;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
+ /*
+ * CPU or memory hotunplug may leave @cs w/o any execution
+ * resources, in which case the hotplug code asynchronously updates
+ * configuration and transfers all tasks to the nearest ancestor
+ * which can execute.
+ *
+ * As writes to "cpus" or "mems" may restore @cs's execution
+ * resources, wait for the previously scheduled operations before
+	 * proceeding, so that we don't keep removing tasks that were added
+	 * after the execution capability is restored.
+ *
+ * Flushing cpuset_hotplug_work is enough to synchronize against
+	 * hotplug handling; however, cpuset_attach() may schedule
+ * propagation work directly. Flush the workqueue too.
+ */
+ flush_work(&cpuset_hotplug_work);
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
- goto out;
+ goto out_unlock;
}
switch (cft->private) {
@@ -1565,8 +1619,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
-out:
- cgroup_unlock();
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
return retval;
}
@@ -1790,15 +1844,12 @@ static struct cftype files[] = {
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
- struct cgroup *parent_cg = cont->parent;
- struct cgroup *tmp_cg;
- struct cpuset *parent, *cs;
+ struct cpuset *cs;
- if (!parent_cg)
+ if (!cont->parent)
return &top_cpuset.css;
- parent = cgroup_cs(parent_cg);
- cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+ cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1857,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
return ERR_PTR(-ENOMEM);
}
- cs->flags = 0;
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
+ INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1;
- cs->parent = parent;
+ return &cs->css;
+}
+
+static int cpuset_css_online(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *parent = parent_cs(cs);
+ struct cpuset *tmp_cs;
+ struct cgroup *pos_cg;
+
+ if (!parent)
+ return 0;
+
+ mutex_lock(&cpuset_mutex);
+
+ set_bit(CS_ONLINE, &cs->flags);
+ if (is_spread_page(parent))
+ set_bit(CS_SPREAD_PAGE, &cs->flags);
+ if (is_spread_slab(parent))
+ set_bit(CS_SPREAD_SLAB, &cs->flags);
+
number_of_cpusets++;
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
- goto skip_clone;
+ if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
+ goto out_unlock;
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1903,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup.
*/
- list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
- struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
-
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
- goto skip_clone;
+ rcu_read_lock();
+ cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+ if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
+ rcu_read_unlock();
+ goto out_unlock;
+ }
}
+ rcu_read_unlock();
mutex_lock(&callback_mutex);
cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
mutex_unlock(&callback_mutex);
-skip_clone:
- return &cs->css;
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ return 0;
+}
+
+static void cpuset_css_offline(struct cgroup *cgrp)
+{
+ struct cpuset *cs = cgroup_cs(cgrp);
+
+ mutex_lock(&cpuset_mutex);
+
+ if (is_sched_load_balance(cs))
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+
+ number_of_cpusets--;
+ clear_bit(CS_ONLINE, &cs->flags);
+
+ mutex_unlock(&cpuset_mutex);
}
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
*/
static void cpuset_css_free(struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- if (is_sched_load_balance(cs))
- update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-
- number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
@@ -1872,8 +1953,11 @@ static void cpuset_css_free(struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
.css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.subsys_id = cpuset_subsys_id,
.base_cftypes = files,
@@ -1911,220 +1995,204 @@ int __init cpuset_init(void)
return 0;
}
-/**
- * cpuset_do_move_task - move a given task to another cpuset
- * @tsk: pointer to task_struct the task to move
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Called by cgroup_scan_tasks() for each task in a cgroup.
- * Return nonzero to stop the walk through the tasks.
- */
-static void cpuset_do_move_task(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- struct cgroup *new_cgroup = scan->data;
-
- cgroup_attach_task(new_cgroup, tsk);
-}
-
-/**
- * move_member_tasks_to_cpuset - move tasks from one cpuset to another
- * @from: cpuset in which the tasks currently reside
- * @to: cpuset to which the tasks will be moved
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
- *
- * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
- * calling callback functions for each.
- */
-static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
-{
- struct cgroup_scanner scan;
-
- scan.cg = from->css.cgroup;
- scan.test_task = NULL; /* select all tasks in cgroup */
- scan.process_task = cpuset_do_move_task;
- scan.heap = NULL;
- scan.data = to->css.cgroup;
-
- if (cgroup_scan_tasks(&scan))
- printk(KERN_ERR "move_member_tasks_to_cpuset: "
- "cgroup_scan_tasks failed\n");
-}
-
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
- * The cgroup's css_sets list is in use if there are tasks
- * in the cpuset; the list is empty if there are none;
- * the cs->css.refcnt seems always 0.
- */
- if (list_empty(&cs->css.cgroup->css_sets))
- return;
-
- /*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
- parent = cs->parent;
+ parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
- parent = parent->parent;
+ parent = parent_cs(parent);
- move_member_tasks_to_cpuset(cs, parent);
+ if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
+ rcu_read_lock();
+ printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
+ cgroup_name(cs->css.cgroup));
+ rcu_read_unlock();
+ }
}
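
remove_tasks_in_empty_cpuset() relies on the invariant that the top cpuset always has online cpus and memory, which is what guarantees the upward walk terminates. A userspace sketch of the walk (hypothetical node type):

#include <stddef.h>

struct node {
	struct node *parent;	/* NULL only for the root */
	int ncpus, nmems;
};

/* Nearest ancestor with both resources; terminates because the root
 * is, by invariant, never empty. */
static struct node *nearest_nonempty(struct node *n)
{
	struct node *p = n->parent;

	while (p->ncpus == 0 || p->nmems == 0)
		p = p->parent;
	return p;
}

int main(void)
{
	struct node root = { .parent = NULL, .ncpus = 4, .nmems = 1 };
	struct node child = { .parent = &root, .ncpus = 0, .nmems = 0 };

	return nearest_nonempty(&child) == &root ? 0 : 1;
}
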
-/*
- * Helper function to traverse cpusets.
- * It can be used to walk the cpuset tree from top to bottom, completing
- * one layer before dropping down to the next (thus always processing a
- * node before any of its children).
+/**
+ * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
+ * @cs: cpuset of interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
*/
-static struct cpuset *cpuset_next(struct list_head *queue)
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{
- struct cpuset *cp;
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
+ static cpumask_t off_cpus;
+ static nodemask_t off_mems, tmp_mems;
+ struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+ bool is_empty;
- if (list_empty(queue))
- return NULL;
+ mutex_lock(&cpuset_mutex);
+
+ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+ nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+ /* remove offline cpus from @cs */
+ if (!cpumask_empty(&off_cpus)) {
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
+ update_tasks_cpumask(cs, NULL);
+ }
- cp = list_first_entry(queue, struct cpuset, stack_list);
- list_del(queue->next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, queue);
+ /* remove offline mems from @cs */
+ if (!nodes_empty(off_mems)) {
+ tmp_mems = cs->mems_allowed;
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(cs, &tmp_mems, NULL);
}
- return cp;
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * If @cs became empty, move tasks to the nearest ancestor with
+	 * execution resources. This is a full cgroup operation which will
+ * also call back into cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ /* the following may free @cs, should be the last operation */
+ css_put(&cs->css);
}
+/**
+ * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
+ * @cs: cpuset of interest
+ *
+ * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
+ * memory masks according to top_cpuset.
+ */
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
+{
+ /*
+ * Pin @cs. The refcnt will be released when the work item
+ * finishes executing.
+ */
+ if (!css_tryget(&cs->css))
+ return;
-/*
- * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
- * online/offline) and update the cpusets accordingly.
- * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
- * cpuset must be moved to a parent cpuset.
+ /*
+ * Queue @cs->hotplug_work. If already pending, lose the css ref.
+ * cpuset_propagate_hotplug_wq is ordered and propagation will
+	 * happen in the order in which this function is called.
+ */
+ if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
+ css_put(&cs->css);
+}
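
The pin-then-queue pattern above holds exactly one css reference per queued work item: take it before queueing, and give it back if queue_work() reports the item was already pending. css_tryget() only succeeds while the object is still live; a userspace analog of that conditional get with C11 atomics (hypothetical helper names):

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object is still live (ref > 0). */
static bool tryget(atomic_int *ref)
{
	int v = atomic_load(ref);

	while (v > 0)
		if (atomic_compare_exchange_weak(ref, &v, v + 1))
			return true;
	return false;	/* object already dying; caller must back off */
}

static void put_ref(atomic_int *ref)
{
	atomic_fetch_sub(ref, 1);	/* the last put would free the object */
}
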
+
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
- * Called with cgroup_mutex held. We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly. The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no effect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
*
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next. It always processes a node before
- * any of its children.
+ * Non-root cpusets are only affected by offlining. If any CPUs or memory
+ * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
+ * descendants.
*
- * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
- * if all present pages from a node are offlined.
+ * Note that CPU offlining during suspend is ignored. We don't modify
+ * cpusets across suspend/resume cycles at all.
*/
-static void
-scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
+static void cpuset_hotplug_workfn(struct work_struct *work)
{
- LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- static nodemask_t oldmems; /* protected by cgroup_mutex */
+ static cpumask_t new_cpus, tmp_cpus;
+ static nodemask_t new_mems, tmp_mems;
+ bool cpus_updated, mems_updated;
+ bool cpus_offlined, mems_offlined;
- list_add_tail((struct list_head *)&root->stack_list, &queue);
+ mutex_lock(&cpuset_mutex);
- switch (event) {
- case CPUSET_CPU_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ /* fetch the available cpus/mems and find out which changed how */
+ cpumask_copy(&new_cpus, cpu_active_mask);
+ new_mems = node_states[N_MEMORY];
- /* Continue past cpusets with all cpus online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
- continue;
+ cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+ cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
+ &new_cpus);
- /* Remove offline cpus from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- mutex_unlock(&callback_mutex);
+ mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
+ mems_offlined = !nodes_empty(tmp_mems);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_cpumask(cp, NULL);
- }
- break;
+ /* synchronize cpus_allowed to cpu_active_mask */
+ if (cpus_updated) {
+ mutex_lock(&callback_mutex);
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ mutex_unlock(&callback_mutex);
+ /* we don't mess with cpumasks of tasks in top_cpuset */
+ }
- case CPUSET_MEM_OFFLINE:
- while ((cp = cpuset_next(&queue)) != NULL) {
+ /* synchronize mems_allowed to N_MEMORY */
+ if (mems_updated) {
+ tmp_mems = top_cpuset.mems_allowed;
+ mutex_lock(&callback_mutex);
+ top_cpuset.mems_allowed = new_mems;
+ mutex_unlock(&callback_mutex);
+ update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+ }
- /* Continue past cpusets with all mems online */
- if (nodes_subset(cp->mems_allowed,
- node_states[N_MEMORY]))
- continue;
+ /* if cpus or mems went down, we need to propagate to descendants */
+ if (cpus_offlined || mems_offlined) {
+ struct cpuset *cs;
+ struct cgroup *pos_cgrp;
- oldmems = cp->mems_allowed;
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+ schedule_cpuset_propagate_hotplug(cs);
+ rcu_read_unlock();
+ }
- /* Remove offline mems from this cpuset. */
- mutex_lock(&callback_mutex);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
- node_states[N_MEMORY]);
- mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else
- update_tasks_nodemask(cp, &oldmems, NULL);
- }
- }
+ /* wait for propagations to finish */
+ flush_workqueue(cpuset_propagate_hotplug_wq);
+
+ /* rebuild sched domains if cpus_allowed has changed */
+ if (cpus_updated)
+ rebuild_sched_domains();
}
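
The "what changed, and did anything go offline" computation is pure mask arithmetic; a runnable sketch with plain bitmasks standing in for cpumask_t:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long old_cpus = 0xfUL;	/* cpus 0-3 were allowed */
	unsigned long new_cpus = 0x7UL;	/* cpu 3 went offline */

	bool updated = old_cpus != new_cpus;		/* !cpumask_equal() */
	unsigned long offlined = old_cpus & ~new_cpus;	/* cpumask_andnot() */

	printf("updated=%d offlined=%#lx\n", updated, offlined);
	return 0;
}
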
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * The only exception to this is suspend/resume, where we don't
- * modify cpusets at all.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus(). Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- *
- * @cpu_online: Indicates whether this is a CPU online event (true) or
- * a CPU offline event (false).
- */
void cpuset_update_active_cpus(bool cpu_online)
{
- struct sched_domain_attr *attr;
- cpumask_var_t *doms;
- int ndoms;
-
- cgroup_lock();
- mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- mutex_unlock(&callback_mutex);
-
- if (!cpu_online)
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
-
- ndoms = generate_sched_domains(&doms, &attr);
- cgroup_unlock();
-
- /* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
+ /*
+	 * We're inside the cpu hotplug critical region, which usually nests
+ * inside cgroup synchronization. Bounce actual hotplug processing
+ * to a work item to avoid reverse locking order.
+ *
+ * We still need to do partition_sched_domains() synchronously;
+	 * otherwise, the scheduler will get confused and put tasks on the
+ * dead CPU. Fall back to the default single domain.
+ * cpuset_hotplug_workfn() will rebuild it as necessary.
+ */
+ partition_sched_domains(1, NULL, NULL);
+ schedule_work(&cpuset_hotplug_work);
}
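
Bouncing the heavy lifting to a work item is the standard answer to an inverted locking order: the notifier context does only the part that must be synchronous and defers the rest to a context that is allowed to take the contended lock. A pthreads sketch of the shape (hypothetical names; a detached thread stands in for schedule_work(), which additionally coalesces already-pending items):

#include <pthread.h>
#include <stddef.h>

static void *hotplug_workfn(void *arg)
{
	/* free to take locks the notifier context could not */
	return NULL;
}

static void on_cpu_hotplug(void)
{
	pthread_t t;

	/* synchronous minimum only; defer the full rebuild */
	if (pthread_create(&t, NULL, hotplug_workfn, NULL) == 0)
		pthread_detach(t);
}
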
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2133,48 +2201,30 @@ void cpuset_update_active_cpus(bool cpu_online)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- static nodemask_t oldmems; /* protected by cgroup_mutex */
-
- cgroup_lock();
- switch (action) {
- case MEM_ONLINE:
- oldmems = top_cpuset.mems_allowed;
- mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
- mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
- break;
- case MEM_OFFLINE:
- /*
- * needn't update top_cpuset.mems_allowed explicitly because
- * scan_cpusets_upon_hotplug() will update it.
- */
- scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
- break;
- default:
- break;
- }
- cgroup_unlock();
-
+ schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
- cpuset_wq = create_singlethread_workqueue("cpuset");
- BUG_ON(!cpuset_wq);
+ cpuset_propagate_hotplug_wq =
+ alloc_ordered_workqueue("cpuset_hotplug", 0);
+ BUG_ON(!cpuset_propagate_hotplug_wq);
}
/**
@@ -2273,8 +2323,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
*/
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
- while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
- cs = cs->parent;
+ while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
+ cs = parent_cs(cs);
return cs;
}
@@ -2412,17 +2462,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-
-void cpuset_unlock(void)
-{
- mutex_unlock(&callback_mutex);
-}
-
-/**
* cpuset_mem_spread_node() - On which node to begin search for a file page
* cpuset_slab_spread_node() - On which node to begin search for a slab page
*
@@ -2497,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
+#define CPUSET_NODELIST_LEN (256)
+
/**
* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
* @task: pointer to task_struct of some task.
@@ -2507,17 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
*/
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
- struct dentry *dentry;
+ /* Statically allocated to prevent using excess stack. */
+ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
+ static DEFINE_SPINLOCK(cpuset_buffer_lock);
+
+ struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
- dentry = task_cs(tsk)->css.cgroup->dentry;
+ rcu_read_lock();
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
- tsk->comm, cpuset_name, cpuset_nodelist);
+ tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
+
spin_unlock(&cpuset_buffer_lock);
+ rcu_read_unlock();
}
/*
@@ -2560,10 +2606,10 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
@@ -2582,35 +2628,21 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!tsk)
goto out_free;
- retval = -EINVAL;
- cgroup_lock();
+ rcu_read_lock();
css = task_subsys_state(tsk, cpuset_subsys_id);
retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+ rcu_read_unlock();
if (retval < 0)
- goto out_unlock;
+ goto out_put_task;
seq_puts(m, buf);
seq_putc(m, '\n');
-out_unlock:
- cgroup_unlock();
+out_put_task:
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task mems_allowed in /proc/<pid>/status file. */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..0506d447aed2 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
*/
#include <linux/pid_namespace.h>
#include <linux/clocksource.h>
+#include <linux/serial_core.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/console.h>
@@ -774,7 +775,7 @@ static void sysrq_handle_dbg(int key)
static struct sysrq_key_op sysrq_dbg_op = {
.handler = sysrq_handle_dbg,
- .help_msg = "debug(G)",
+ .help_msg = "debug(g)",
.action_msg = "DEBUG",
};
#endif
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7a..2235967e78b0 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
#ifdef CONFIG_KGDB_KDB
extern int kdb_stub(struct kgdb_state *ks);
extern int kdb_parse(const char *cmdstr);
+extern int kdb_common_init_state(struct kgdb_state *ks);
+extern int kdb_common_deinit_state(void);
#else /* ! CONFIG_KGDB_KDB */
static inline int kdb_stub(struct kgdb_state *ks)
{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..19d9a578c753 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
#include <linux/kernel.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/serial_core.h>
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
len = len / 2;
remcom_out_buffer[len++] = 0;
+ kdb_common_init_state(ks);
kdb_parse(remcom_out_buffer);
+ kdb_common_deinit_state();
+
strcpy(remcom_out_buffer, "OK");
}
break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5d..70a504601dc3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
/*
* kdb_ss
*
- * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
- * commands.
+ * Process the 'ss' (Single Step) command.
*
* ss
- * ssb
*
* Parameters:
* argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
* Outputs:
* None.
* Returns:
- * KDB_CMD_SS[B] for success, a kdb error if failure.
+ * KDB_CMD_SS for success, a kdb error if failure.
* Locking:
* None.
* Remarks:
*
* Set the arch specific option to trigger a debug trap after the next
* instruction.
- *
- * For 'ssb', set the trace flag in the debug trap handler
- * after printing the current insn and return directly without
- * invoking the kdb command processor, until a branch instruction
- * is encountered.
*/
static int kdb_ss(int argc, const char **argv)
{
- int ssb = 0;
-
- ssb = (strcmp(argv[0], "ssb") == 0);
if (argc != 0)
return KDB_ARGCOUNT;
/*
* Set trace flag and go.
*/
KDB_STATE_SET(DOING_SS);
- if (ssb) {
- KDB_STATE_SET(DOING_SSB);
- return KDB_CMD_SSB;
- }
return KDB_CMD_SS;
}
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
kdb_register_repeat("ss", kdb_ss, "",
"Single Step", 1, KDB_REPEAT_NO_ARGS);
- kdb_register_repeat("ssb", kdb_ss, "",
- "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
/*
* Architecture dependent initialization.
*/
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d30..328d18ef31e4 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
static struct kgdb_state *kdb_ks;
+int kdb_common_init_state(struct kgdb_state *ks)
+{
+ kdb_initial_cpu = atomic_read(&kgdb_active);
+ kdb_current_task = kgdb_info[ks->cpu].task;
+ kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ return 0;
+}
+
+int kdb_common_deinit_state(void)
+{
+ kdb_initial_cpu = -1;
+ kdb_current_task = NULL;
+ kdb_current_regs = NULL;
+ return 0;
+}
+
int kdb_stub(struct kgdb_state *ks)
{
int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
- kdb_initial_cpu = atomic_read(&kgdb_active);
- kdb_current_task = kgdb_info[ks->cpu].task;
- kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
+ kdb_common_init_state(ks);
/* Remove any breakpoints as needed by kdb and clear single step */
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
- KDB_STATE_CLEAR(DOING_SSB);
KDB_STATE_SET(PAGER);
/* zero out any offline cpu data */
for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
* Upon exit from the kdb main loop setup break points and restart
* the system based on the requested continue state
*/
- kdb_initial_cpu = -1;
- kdb_current_task = NULL;
- kdb_current_regs = NULL;
+ kdb_common_deinit_state();
KDB_STATE_CLEAR(PAGER);
kdbnearsym_cleanup();
if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 8875254120b6..00eb8f7fbf41 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
};
#undef KDBMSG
-static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
+static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
/*
@@ -175,7 +175,7 @@ static char *__env[] = {
(char *)0,
};
-static const int __nenv = (sizeof(__env) / sizeof(char *));
+static const int __nenv = ARRAY_SIZE(__env);
struct task_struct *kdb_curr_task(int cpu)
{
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
}
if (argc != 3)
return KDB_ARGCOUNT;
- defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
- GFP_KDB);
- if (!defcmd_set) {
- kdb_printf("Could not allocate new defcmd_set entry for %s\n",
- argv[1]);
- defcmd_set = save_defcmd_set;
+ if (in_dbg_master()) {
+ kdb_printf("Command only available during kdb_init()\n");
return KDB_NOTIMP;
}
+ defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
+ GFP_KDB);
+ if (!defcmd_set)
+ goto fail_defcmd;
memcpy(defcmd_set, save_defcmd_set,
defcmd_set_count * sizeof(*defcmd_set));
- kfree(save_defcmd_set);
s = defcmd_set + defcmd_set_count;
memset(s, 0, sizeof(*s));
s->usable = 1;
s->name = kdb_strdup(argv[1], GFP_KDB);
+ if (!s->name)
+ goto fail_name;
s->usage = kdb_strdup(argv[2], GFP_KDB);
+ if (!s->usage)
+ goto fail_usage;
s->help = kdb_strdup(argv[3], GFP_KDB);
+ if (!s->help)
+ goto fail_help;
if (s->usage[0] == '"') {
- strcpy(s->usage, s->usage+1);
+ strcpy(s->usage, argv[2]+1);
s->usage[strlen(s->usage)-1] = '\0';
}
if (s->help[0] == '"') {
- strcpy(s->help, s->help+1);
+ strcpy(s->help, argv[3]+1);
s->help[strlen(s->help)-1] = '\0';
}
++defcmd_set_count;
defcmd_in_progress = 1;
+ kfree(save_defcmd_set);
return 0;
+fail_help:
+ kfree(s->usage);
+fail_usage:
+ kfree(s->name);
+fail_name:
+ kfree(defcmd_set);
+fail_defcmd:
+ kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
+ defcmd_set = save_defcmd_set;
+ return KDB_NOTIMP;
}
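
The reworked kdb_defcmd() failure path is the classic kernel goto-unwind: each allocation gets its own label, and a failure jumps to the label that frees everything allocated before it, in reverse order. A compact userspace sketch of the same structure (hypothetical types):

#include <stdlib.h>
#include <string.h>

struct cmd { char *name, *usage, *help; };

static struct cmd *cmd_alloc(const char *n, const char *u, const char *h)
{
	struct cmd *c = malloc(sizeof(*c));

	if (!c)
		return NULL;
	c->name = strdup(n);
	if (!c->name)
		goto fail_name;
	c->usage = strdup(u);
	if (!c->usage)
		goto fail_usage;
	c->help = strdup(h);
	if (!c->help)
		goto fail_help;
	return c;

fail_help:
	free(c->usage);
fail_usage:
	free(c->name);
fail_name:
	free(c);
	return NULL;
}
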
/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
* KDB_CMD_GO User typed 'go'.
* KDB_CMD_CPU User switched to another cpu.
* KDB_CMD_SS Single step.
- * KDB_CMD_SSB Single step until branch.
*/
static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
instruction_pointer(regs));
break;
- case KDB_DB_SSB:
- /*
- * In the midst of ssb command. Just return.
- */
- KDB_DEBUG_STATE("kdb_local 3", reason);
- return KDB_CMD_SSB; /* Continue with SSB command */
-
- break;
case KDB_DB_SS:
break;
case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
if (diag == KDB_CMD_GO
|| diag == KDB_CMD_CPU
|| diag == KDB_CMD_SS
- || diag == KDB_CMD_SSB
|| diag == KDB_CMD_KGDB)
break;
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
break;
}
- if (result == KDB_CMD_SSB) {
- KDB_STATE_SET(DOING_SS);
- KDB_STATE_SET(DOING_SSB);
- break;
- }
-
if (result == KDB_CMD_KGDB) {
if (!KDB_STATE(DOING_KGDB))
kdb_printf("Entering please attach debugger "
@@ -2350,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
return 0;
}
-/*
- * kdb_ll - This function implements the 'll' command which follows a
- * linked list and executes an arbitrary command for each
- * element.
- */
-static int kdb_ll(int argc, const char **argv)
-{
- int diag = 0;
- unsigned long addr;
- long offset = 0;
- unsigned long va;
- unsigned long linkoffset;
- int nextarg;
- const char *command;
-
- if (argc != 3)
- return KDB_ARGCOUNT;
-
- nextarg = 1;
- diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
- if (diag)
- return diag;
-
- diag = kdbgetularg(argv[2], &linkoffset);
- if (diag)
- return diag;
-
- /*
- * Using the starting address as
- * the first element in the list, and assuming that
- * the list ends with a null pointer.
- */
-
- va = addr;
- command = kdb_strdup(argv[3], GFP_KDB);
- if (!command) {
- kdb_printf("%s: cannot duplicate command\n", __func__);
- return 0;
- }
- /* Recursive use of kdb_parse, do not use argv after this point */
- argv = NULL;
-
- while (va) {
- char buf[80];
-
- if (KDB_FLAG(CMD_INTERRUPT))
- goto out;
-
- sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
- diag = kdb_parse(buf);
- if (diag)
- goto out;
-
- addr = va + linkoffset;
- if (kdb_getword(&va, addr, sizeof(va)))
- goto out;
- }
-
-out:
- kfree(command);
- return diag;
-}
-
static int kdb_kgdb(int argc, const char **argv)
{
return KDB_CMD_KGDB;
@@ -2430,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
kdb_printf("-----------------------------"
"-----------------------------\n");
for_each_kdbcmd(kt, i) {
- if (kt->cmd_name)
- kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
- kt->cmd_usage, kt->cmd_help);
+ char *space = "";
if (KDB_FLAG(CMD_INTERRUPT))
return 0;
+ if (!kt->cmd_name)
+ continue;
+ if (strlen(kt->cmd_usage) > 20)
+ space = "\n ";
+ kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
+ kt->cmd_usage, space, kt->cmd_help);
}
return 0;
}
@@ -2739,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
(kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
kfree(kdb_commands);
}
- memset(new + kdb_max_commands, 0,
+ memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
kdb_command_extend * sizeof(*new));
kdb_commands = new;
kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
@@ -2843,15 +2784,13 @@ static void __init kdb_inittab(void)
"Stack traceback", 1, KDB_REPEAT_NONE);
kdb_register_repeat("btp", kdb_bt, "<pid>",
"Display stack for process <pid>", 0, KDB_REPEAT_NONE);
- kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
- "Display stack all processes", 0, KDB_REPEAT_NONE);
+ kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
+ "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btc", kdb_bt, "",
"Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
kdb_register_repeat("btt", kdb_bt, "<vaddr>",
"Backtrace process given its struct task address", 0,
KDB_REPEAT_NONE);
- kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
- "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
kdb_register_repeat("env", kdb_env, "",
"Show environment variables", 0, KDB_REPEAT_NONE);
kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a25844..7afd3c8c41d5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
#define KDB_CMD_SS (-1003)
-#define KDB_CMD_SSB (-1004)
#define KDB_CMD_KGDB (-1005)
/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
* kdb control */
#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
-#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
- * DOING_SS is also set */
#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
* after one ss, independent of
* DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
typedef enum {
KDB_DB_BPT, /* Breakpoint */
KDB_DB_SS, /* Single-step trap */
- KDB_DB_SSB, /* Single step to branch */
KDB_DB_SSBPT, /* Single step over breakpoint */
KDB_DB_NOBPT /* Spurious breakpoint */
} kdb_dbtrap_t;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
unsigned long long t2, t3;
unsigned long flags;
struct timespec ts;
+ cputime_t utime, stime, stimescaled, utimescaled;
	/* Though tsk->delays is accessed later, an early exit avoids
	 * unnecessarily returning other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
goto done;
tmp = (s64)d->cpu_run_real_total;
- cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ task_cputime(tsk, &utime, &stime);
+ cputime_to_timespec(utime + stime, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
tmp = (s64)d->cpu_scaled_run_real_total;
- cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+ task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+ cputime_to_timespec(utimescaled + stimescaled, &ts);
tmp += timespec_to_ns(&ts);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
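
The delayacct conversion stops reading tsk->utime/stime directly and goes through the task_cputime()/task_cputime_scaled() accessors, which return both values via out-parameters (with virtual cputime accounting they may have to compute them rather than just load fields). A sketch of the accessor shape with hypothetical userspace types:

typedef unsigned long long cputime_t;

struct task {
	cputime_t utime, stime;
};

/* Returning both fields through out-parameters keeps callers away
 * from the raw fields, leaving room for lazier accounting schemes. */
static void task_cputime(const struct task *t, cputime_t *ut, cputime_t *st)
{
	*ut = t->utime;
	*st = t->stime;
}
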
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b6646a8c067..6b41c1899a8b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -37,6 +38,7 @@
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
+#include <linux/cgroup.h>
#include "internal.h"
@@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
* if there is no cgroup event for the current CPU context.
@@ -251,7 +267,22 @@ perf_cgroup_match(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- return !event->cgrp || event->cgrp == cpuctx->cgrp;
+ /* @event doesn't care about cgroup */
+ if (!event->cgrp)
+ return true;
+
+ /* wants specific cgroup scope but @cpuctx isn't associated with any */
+ if (!cpuctx->cgrp)
+ return false;
+
+ /*
+ * Cgroup scoping is recursive. An event enabled for a cgroup is
+ * also enabled for all its descendant cgroups. If @cpuctx's
+ * cgroup is a descendant of @event's (the test covers identity
+ * case), it's a match.
+ */
+ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+ event->cgrp->css.cgroup);
}
static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -655,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
+ if (list_empty(&cpuctx->rotation_list)) {
+ int was_empty = list_empty(head);
list_add(&cpuctx->rotation_list, head);
+ if (was_empty)
+ tick_nohz_full_kick();
+ }
}
static void get_ctx(struct perf_event_context *ctx)
@@ -961,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
event->header_size = size;
}
@@ -2555,6 +2596,16 @@ done:
list_del_init(&cpuctx->rotation_list);
}
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (list_empty(&__get_cpu_var(rotation_list)))
+ return true;
+ else
+ return false;
+}
+#endif
+
void perf_event_task_tick(void)
{
struct list_head *head = &__get_cpu_var(rotation_list);
@@ -3691,7 +3742,7 @@ unlock:
static int perf_fasync(int fd, struct file *filp, int on)
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
@@ -4178,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4434,12 +4491,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_task_ctx(ctx, task_event);
}
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
+ if (task_event->task_ctx)
+ perf_event_task_ctx(task_event->task_ctx, task_event);
+
rcu_read_unlock();
}
@@ -4593,6 +4653,7 @@ void perf_event_comm(struct task_struct *task)
struct perf_event_context *ctx;
int ctxn;
+ rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
@@ -4600,6 +4661,7 @@ void perf_event_comm(struct task_struct *task)
perf_event_enable_on_exec(ctx);
}
+ rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -4734,7 +4796,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
} else {
if (arch_vma_name(mmap_event->vma)) {
name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp));
+ sizeof(tmp) - 1);
+ tmp[sizeof(tmp) - 1] = '\0';
goto got_name;
}
@@ -4761,6 +4824,9 @@ got_name:
mmap_event->file_name = name;
mmap_event->file_size = size;
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
rcu_read_lock();
@@ -5126,7 +5192,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
struct perf_event *event;
- struct hlist_node *node;
struct hlist_head *head;
rcu_read_lock();
@@ -5134,7 +5199,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
if (!head)
goto end;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
@@ -5328,7 +5393,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
@@ -5419,7 +5484,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
{
struct perf_sample_data data;
struct perf_event *event;
- struct hlist_node *node;
struct perf_raw_record raw = {
.size = entry_size,
@@ -5429,7 +5493,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
- hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
@@ -5649,6 +5713,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5965,13 +6030,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
pmu->name = name;
if (type < 0) {
- int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
- if (!err)
- goto free_pdc;
-
- err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
- if (err) {
- ret = err;
+ type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
+ if (type < 0) {
+ ret = type;
goto free_pdc;
}
}
@@ -5988,6 +6049,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
+ ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
@@ -6171,11 +6233,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (task) {
event->attach_state = PERF_ATTACH_TASK;
+
+ if (attr->type == PERF_TYPE_TRACEPOINT)
+ event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
- if (attr->type == PERF_TYPE_BREAKPOINT)
+ else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}
@@ -7512,12 +7577,5 @@ struct cgroup_subsys perf_subsys = {
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
-
- /*
- * perf_event cgroup doesn't handle nesting correctly.
- * ctx->nr_cgroups adjustments should be propagated through the
- * cgroup hierarchy. Fix it and remove the following.
- */
- .broken_hierarchy = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
for (i = 0; i < TYPE_MAX; i++)
- kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+ kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
if (err_cpu == cpu)
break;
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
- int writable; /* are we writable */
+ int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..cd55144270b5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
unsigned long offset, unsigned long head)
{
- unsigned long mask;
+ unsigned long sz = perf_data_size(rb);
+ unsigned long mask = sz - 1;
- if (!rb->writable)
+ /*
+ * check if user-writable
+ * overwrite : over-write its own tail
+ * !overwrite: buffer possibly drops events.
+ */
+ if (rb->overwrite)
return true;
- mask = perf_data_size(rb) - 1;
+ /*
+ * verify that payload is not bigger than buffer
+ * otherwise masking logic may fail to detect
+ * the "not enough space" condition
+ */
+ if ((head - offset) > sz)
+ return false;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
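
The arithmetic here is wraparound-safe: tail, offset, and head are free-running counters, distances are reduced modulo the power-of-two size, and a payload larger than the whole buffer is rejected up front because the masking below could otherwise alias it to a small value. A runnable sketch of the same idea (the real function performs further checks beyond this hunk):

#include <stdbool.h>

/* sz must be a power of two; the counters may wrap freely. */
static bool has_space(unsigned long sz, unsigned long tail,
		      unsigned long offset, unsigned long head)
{
	unsigned long mask = sz - 1;

	if (head - offset > sz)		/* payload bigger than the buffer */
		return false;

	offset = (offset - tail) & mask;
	head = (head - tail) & mask;

	/* the write from offset to head must not cross the reader's tail */
	return head >= offset;
}
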
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
- rb->writable = 1;
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
@@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
}
#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(rb)))
+ /* The '>' (rather than '>=') accounts for the user page. */
+ if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
int i, nr;
rb = container_of(work, struct ring_buffer, work);
- nr = 1 << page_order(rb);
+ nr = data_page_nr(rb);
base = rb->user_page;
- for (i = 0; i < nr + 1; i++)
+ /* The '<=' (rather than '<') accounts for the user page. */
+ for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
@@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
- rb->nr_pages = 1;
+ rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
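
All three fixes account for the same layout: the vmalloc'd buffer is one user control page followed by the data pages, so size checks use data_page_nr() and the free loop runs one page further. A sketch of the assumed arithmetic (field names mirror the diff; illustrative only):

#include <linux/mm.h>	/* PAGE_SIZE */

struct rb_sketch {
	void *user_page;	/* page 0: control page */
	int page_order;		/* data area is 2^page_order pages */
};

static int sketch_data_page_nr(struct rb_sketch *rb)
{
	return 1 << rb->page_order;	/* data pages, user page excluded */
}

static void *sketch_page(struct rb_sketch *rb, unsigned long pgoff)
{
	/* '>' rather than '>=' because pgoff 0 is the user page itself */
	if (pgoff > sketch_data_page_nr(rb))
		return NULL;
	return rb->user_page + pgoff * PAGE_SIZE;
}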
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..f3569747d629 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
-
-static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
-
-#define UPROBES_HASH_SZ 13
-
/*
- * We need separate register/unregister and mmap/munmap lock hashes because
- * of mmap_sem nesting.
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
+ * Allows us to skip uprobe_mmap() if there are no uprobe events active
+ * at this time. Probably a fine-grained per-inode count would be better?
*/
+#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
-/* serialize (un)register */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-
-#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
+#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
static struct percpu_rw_semaphore dup_mmap_sem;
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time. Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
-
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBE_RUN_HANDLER 1
/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 2
+#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
+ struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
- struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
@@ -101,6 +75,15 @@ struct uprobe {
struct arch_uprobe arch;
};
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -199,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
return *insn == UPROBE_SWBP_INSN;
}
-static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
- memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
kunmap_atomic(kaddr);
}
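
copy_from_page()/copy_to_page() factor out the kmap_atomic()/memcpy()/kunmap_atomic() dance repeated throughout this file. A hypothetical caller, assuming the helpers are in scope, reading the old opcode out and poking the new one in:

#include <linux/uprobes.h>	/* uprobe_opcode_t, UPROBE_SWBP_INSN_SIZE */

static void example_swap_opcode(struct page *page, unsigned long vaddr,
				uprobe_opcode_t *old, uprobe_opcode_t new)
{
	copy_from_page(page, vaddr, old, UPROBE_SWBP_INSN_SIZE);
	copy_to_page(page, vaddr, &new, UPROBE_SWBP_INSN_SIZE);
}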
@@ -211,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
uprobe_opcode_t old_opcode;
bool is_swbp;
- copy_opcode(page, vaddr, &old_opcode);
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
@@ -230,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify is_swbp_at_addr and
+ * supported by that architecture then we need to modify is_trap_at_addr and
* write_opcode accordingly. This would never be a problem for archs that
* have fixed length instructions.
*/
@@ -251,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
- void *vaddr_old, *vaddr_new;
struct vm_area_struct *vma;
int ret;
@@ -272,15 +284,8 @@ retry:
__SetPageUptodate(new_page);
- /* copy the page now that we've got it stable */
- vaddr_old = kmap_atomic(old_page);
- vaddr_new = kmap_atomic(new_page);
-
- memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
- memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
-
- kunmap_atomic(vaddr_new);
- kunmap_atomic(vaddr_old);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = anon_vma_prepare(vma);
if (ret)
@@ -430,9 +435,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
-
return u;
}
@@ -452,8 +454,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
+ init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- mutex_init(&uprobe->copy_mutex);
+ /* For now assume that the instruction need not be single-stepped */
+ __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +467,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
kfree(uprobe);
uprobe = cur_uprobe;
iput(inode);
- } else {
- atomic_inc(&uprobe_events);
}
return uprobe;
}
-static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
-{
- struct uprobe_consumer *uc;
-
- if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
- return;
-
- down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- if (!uc->filter || uc->filter(uc, current))
- uc->handler(uc, regs);
- }
- up_read(&uprobe->consumer_rwsem);
-}
-
-/* Returns the previous consumer */
-static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
up_write(&uprobe->consumer_rwsem);
-
- return uc->next;
}
/*
@@ -525,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
unsigned long nbytes, loff_t offset)
{
struct page *page;
- void *vaddr;
- unsigned long off;
- pgoff_t idx;
-
- if (!filp)
- return -EINVAL;
if (!mapping->a_ops->readpage)
return -EIO;
-
- idx = offset >> PAGE_CACHE_SHIFT;
- off = offset & ~PAGE_MASK;
-
/*
* Ensure that the page that has the original instruction is
* populated and in page-cache.
*/
- page = read_mapping_page(mapping, idx, filp);
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
if (IS_ERR(page))
return PTR_ERR(page);
- vaddr = kmap_atomic(page);
- memcpy(insn, vaddr + off, nbytes);
- kunmap_atomic(vaddr);
+ copy_from_page(page, offset, insn, nbytes);
page_cache_release(page);
return 0;
@@ -588,7 +559,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
return ret;
- mutex_lock(&uprobe->copy_mutex);
+ /* TODO: move this into _register, until then we abuse this sem. */
+ down_write(&uprobe->consumer_rwsem);
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
goto out;
@@ -597,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
goto out;
ret = -ENOTSUPP;
- if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -612,7 +584,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
- mutex_unlock(&uprobe->copy_mutex);
+ up_write(&uprobe->consumer_rwsem);
+
+ return ret;
+}
+
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ return !uc->filter || uc->filter(uc, ctx, mm);
+}
+
+static bool filter_chain(struct uprobe *uprobe,
+ enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+ struct uprobe_consumer *uc;
+ bool ret = false;
+
+ down_read(&uprobe->consumer_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ ret = consumer_filter(uc, ctx, mm);
+ if (ret)
+ break;
+ }
+ up_read(&uprobe->consumer_rwsem);
return ret;
}
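
filter_chain() reports true as soon as any consumer accepts the mm, and a consumer without ->filter accepts everything. A hedged sketch of a filter callback that restricts events to a single mm (traced_mm is hypothetical state set elsewhere):

#include <linux/uprobes.h>

static struct mm_struct *traced_mm;	/* hypothetical */

static bool my_filter(struct uprobe_consumer *self,
		      enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	return mm == traced_mm;		/* accept only the mm we trace */
}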
@@ -624,16 +619,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
bool first_uprobe;
int ret;
- /*
- * If probe is being deleted, unregister thread could be done with
- * the vma-rmap-walk through. Adding a probe now can be fatal since
- * nobody will be able to cleanup. Also we could be from fork or
- * mremap path, where the probe might have already been inserted.
- * Hence behave as if probe already existed.
- */
- if (!uprobe->consumers)
- return 0;
-
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
if (ret)
return ret;
@@ -658,14 +643,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- /* can happen if uprobe_register() fails */
- if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
- return 0;
-
set_bit(MMF_RECALC_UPROBES, &mm->flags);
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +658,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
*/
static void delete_uprobe(struct uprobe *uprobe)
{
+ if (WARN_ON(!uprobe_is_active(uprobe)))
+ return;
+
spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
+ RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
iput(uprobe->inode);
put_uprobe(uprobe);
- atomic_dec(&uprobe_events);
}
struct map_info {
@@ -764,8 +752,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
return curr;
}
-static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
+ bool is_register = !!new;
struct map_info *info;
int err = 0;
@@ -787,17 +777,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
down_write(&mm->mmap_sem);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
- vma->vm_file->f_mapping->host != uprobe->inode)
+ file_inode(vma->vm_file) != uprobe->inode)
goto unlock;
if (vma->vm_start > info->vaddr ||
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
- if (is_register)
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- else
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ if (is_register) {
+ /* consult only the "caller", new consumer. */
+ if (consumer_filter(new,
+ UPROBE_FILTER_REGISTER, mm))
+ err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+ if (!filter_chain(uprobe,
+ UPROBE_FILTER_UNREGISTER, mm))
+ err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ }
unlock:
up_write(&mm->mmap_sem);
@@ -810,17 +806,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
return err;
}
-static int __uprobe_register(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- return register_for_each_vma(uprobe, true);
+ consumer_add(uprobe, uc);
+ return register_for_each_vma(uprobe, uc);
}
-static void __uprobe_unregister(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- if (!register_for_each_vma(uprobe, false))
- delete_uprobe(uprobe);
+ int err;
+ if (!consumer_del(uprobe, uc)) /* WARN? */
+ return;
+
+ err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
+ if (!uprobe->consumers && !err)
+ delete_uprobe(uprobe);
}
/*
@@ -845,31 +847,63 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
struct uprobe *uprobe;
int ret;
- if (!inode || !uc || uc->next)
+ /* The consumer must have at least one handler set */
+ if (!uc->handler && !uc->ret_handler)
return -EINVAL;
+ /* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
- ret = 0;
- mutex_lock(uprobes_hash(inode));
+ retry:
uprobe = alloc_uprobe(inode, offset);
-
- if (!uprobe) {
- ret = -ENOMEM;
- } else if (!consumer_add(uprobe, uc)) {
- ret = __uprobe_register(uprobe);
- if (ret) {
- uprobe->consumers = NULL;
- __uprobe_unregister(uprobe);
- } else {
- set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+ if (!uprobe)
+ return -ENOMEM;
+ /*
+ * We can race with uprobe_unregister()->delete_uprobe().
+ * Check uprobe_is_active() and retry if it is false.
+ */
+ down_write(&uprobe->register_rwsem);
+ ret = -EAGAIN;
+ if (likely(uprobe_is_active(uprobe))) {
+ ret = __uprobe_register(uprobe, uc);
+ if (ret)
+ __uprobe_unregister(uprobe, uc);
}
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ if (unlikely(ret == -EAGAIN))
+ goto retry;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+
+/*
+ * uprobe_apply - add or remove the breakpoints of an already registered probe.
+ * @inode: the file in which the probe resides.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+ struct uprobe_consumer *uc, bool add)
+{
+ struct uprobe *uprobe;
+ struct uprobe_consumer *con;
+ int ret = -ENOENT;
+
+ uprobe = find_uprobe(inode, offset);
+ if (!uprobe)
+ return ret;
+
+ down_write(&uprobe->register_rwsem);
+ for (con = uprobe->consumers; con && con != uc ; con = con->next)
+ ;
+ if (con)
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
return ret;
}
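
With the register-mutex hash gone, serialization moves to the per-uprobe register_rwsem, and a consumer must now supply at least one handler. A hedged sketch of the consumer-side API as reshaped above:

#include <linux/uprobes.h>

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	return 0;	/* UPROBE_HANDLER_REMOVE would drop the breakpoint */
}

static struct uprobe_consumer my_uc = {
	.handler = my_handler,	/* at least one of handler/ret_handler */
};

static int attach(struct inode *inode, loff_t offset)
{
	/* inode:offset identify the probed instruction in the file */
	return uprobe_register(inode, offset, &my_uc);
}

static void detach(struct inode *inode, loff_t offset)
{
	uprobe_unregister(inode, offset, &my_uc);
}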
@@ -884,25 +918,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
{
struct uprobe *uprobe;
- if (!inode || !uc)
- return;
-
uprobe = find_uprobe(inode, offset);
if (!uprobe)
return;
- mutex_lock(uprobes_hash(inode));
+ down_write(&uprobe->register_rwsem);
+ __uprobe_unregister(uprobe, uc);
+ up_write(&uprobe->register_rwsem);
+ put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
- if (consumer_del(uprobe, uc)) {
- if (!uprobe->consumers) {
- __uprobe_unregister(uprobe);
- clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
- }
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err = 0;
+
+ down_read(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long vaddr;
+ loff_t offset;
+
+ if (!valid_vma(vma, false) ||
+ file_inode(vma->vm_file) != uprobe->inode)
+ continue;
+
+ offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+ if (uprobe->offset < offset ||
+ uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+ continue;
+
+ vaddr = offset_to_vaddr(vma, uprobe->offset);
+ err |= remove_breakpoint(uprobe, mm, vaddr);
}
+ up_read(&mm->mmap_sem);
- mutex_unlock(uprobes_hash(inode));
- if (uprobe)
- put_uprobe(uprobe);
+ return err;
}
static struct rb_node *
@@ -979,18 +1030,23 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+ if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
- inode = vma->vm_file->f_mapping->host;
+ inode = file_inode(vma->vm_file);
if (!inode)
return 0;
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
-
+ /*
+ * We can race with uprobe_unregister(), this uprobe can be already
+ * removed. But in this case filter_chain() must return false, all
+ * consumers have gone away.
+ */
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1008,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
struct inode *inode;
struct rb_node *n;
- inode = vma->vm_file->f_mapping->host;
+ inode = file_inode(vma->vm_file);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
@@ -1025,7 +1081,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
- if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+ if (no_uprobe_events() || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1098,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct xol_area *area)
{
- struct mm_struct *mm;
- int ret;
-
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
- return -ENOMEM;
-
- ret = -EALREADY;
- mm = current->mm;
+ struct mm_struct *mm = current->mm;
+ int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
ret = -ENOMEM;
-
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1121,59 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
ret = 0;
-
-fail:
+ fail:
up_write(&mm->mmap_sem);
- if (ret)
- __free_page(area->page);
return ret;
}
-static struct xol_area *get_xol_area(struct mm_struct *mm)
-{
- struct xol_area *area;
-
- area = mm->uprobes_state.xol_area;
- smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
-
- return area;
-}
-
/*
- * xol_alloc_area - Allocate process's xol_area.
- * This area will be used for storing instructions for execution out of
- * line.
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
*
* Returns the allocated area or NULL.
*/
-static struct xol_area *xol_alloc_area(void)
+static struct xol_area *get_xol_area(void)
{
+ struct mm_struct *mm = current->mm;
struct xol_area *area;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+
+ area = mm->uprobes_state.xol_area;
+ if (area)
+ goto ret;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
- return NULL;
+ goto out;
area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
-
if (!area->bitmap)
- goto fail;
+ goto free_area;
+
+ area->page = alloc_page(GFP_HIGHUSER);
+ if (!area->page)
+ goto free_bitmap;
+ /* allocate first slot of task's xol_area for the return probes */
+ set_bit(0, area->bitmap);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ atomic_set(&area->slot_count, 1);
init_waitqueue_head(&area->wq);
+
if (!xol_add_vma(area))
return area;
-fail:
+ __free_page(area->page);
+ free_bitmap:
kfree(area->bitmap);
+ free_area:
kfree(area);
-
- return get_xol_area(current->mm);
+ out:
+ area = mm->uprobes_state.xol_area;
+ ret:
+ smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
+ return area;
}
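
get_xol_area() now follows the usual lazy-publish shape: allocate outside the lock, publish under mmap_sem in xol_add_vma(), and on a lost race fall back to whatever another thread installed; the read barrier pairs with the wmb at publish time. The generic pattern, sketched with hypothetical helpers:

static struct xol_area *lazy_get(struct mm_struct *mm)
{
	struct xol_area *area = mm->uprobes_state.xol_area;

	if (!area) {
		area = alloc_and_init_area();		/* hypothetical */
		if (!area)
			goto out;
		if (!publish_under_lock(mm, area))	/* 0 == published */
			return area;
		free_area(area);			/* lost the race */
	}
 out:
	area = mm->uprobes_state.xol_area;
	smp_read_barrier_depends();	/* pairs with wmb at publish */
	return area;
}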
/*
@@ -1186,43 +1239,31 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
}
/*
- * xol_get_insn_slot - If was not allocated a slot, then
- * allocate a slot.
+ * xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
- unsigned long offset;
- void *vaddr;
+ unsigned long xol_vaddr;
- area = get_xol_area(current->mm);
- if (!area) {
- area = xol_alloc_area();
- if (!area)
- return 0;
- }
- current->utask->xol_vaddr = xol_take_insn_slot(area);
+ area = get_xol_area();
+ if (!area)
+ return 0;
- /*
- * Initialize the slot if xol_vaddr points to valid
- * instruction slot.
- */
- if (unlikely(!current->utask->xol_vaddr))
+ xol_vaddr = xol_take_insn_slot(area);
+ if (unlikely(!xol_vaddr))
return 0;
- current->utask->vaddr = slot_addr;
- offset = current->utask->xol_vaddr & ~PAGE_MASK;
- vaddr = kmap_atomic(area->page);
- memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
- kunmap_atomic(vaddr);
+ /* Initialize the slot */
+ copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
*/
flush_dcache_page(area->page);
- return current->utask->xol_vaddr;
+ return xol_vaddr;
}
/*
@@ -1240,8 +1281,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
return;
slot_addr = tsk->utask->xol_vaddr;
-
- if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
+ if (unlikely(!slot_addr))
return;
area = tsk->mm->uprobes_state.xol_area;
@@ -1282,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
if (!utask)
return;
@@ -1289,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
if (utask->active_uprobe)
put_uprobe(utask->active_uprobe);
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
@@ -1303,33 +1353,135 @@ void uprobe_copy_process(struct task_struct *t)
}
/*
- * Allocate a uprobe_task object for the task.
- * Called when the thread hits a breakpoint for the first time.
+ * Allocate a uprobe_task object for the task if necessary.
+ * Called when the thread hits a breakpoint.
*
* Returns:
* - pointer to new uprobe_task on success
* - NULL otherwise
*/
-static struct uprobe_task *add_utask(void)
+static struct uprobe_task *get_utask(void)
{
+ if (!current->utask)
+ current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+ return current->utask;
+}
+
+/*
+ * The current area->vaddr notion assumes the trampoline address is always
+ * equal to area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
- utask = kzalloc(sizeof *utask, GFP_KERNEL);
- if (unlikely(!utask))
- return NULL;
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
- current->utask = utask;
- return utask;
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep the trampoline address on the stack, rather keep
+ * the original return address of the first caller through all the
+ * subsequent instances. This also makes breakpoint unwinding easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
}
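
Each uretprobe hit pushes a return_instance onto a per-task LIFO; chained entries reuse the first caller's original return address, so the trampoline address itself never sits on the user stack. A simplified sketch of the push/pop discipline (types and fields as introduced above; depth accounting abbreviated):

static void push_ri(struct uprobe_task *utask, struct return_instance *ri)
{
	ri->next = utask->return_instances;
	utask->return_instances = ri;
	utask->depth++;
}

static struct return_instance *pop_ri(struct uprobe_task *utask)
{
	struct return_instance *ri = utask->return_instances;

	if (ri) {
		utask->return_instances = ri->next;
		utask->depth--;
	}
	return ri;
}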
/* Prepare to single-step probed instruction out of line. */
static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
- if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
- return 0;
+ struct uprobe_task *utask;
+ unsigned long xol_vaddr;
+ int err;
- return -EFAULT;
+ utask = get_utask();
+ if (!utask)
+ return -ENOMEM;
+
+ xol_vaddr = xol_get_insn_slot(uprobe);
+ if (!xol_vaddr)
+ return -ENOMEM;
+
+ utask->xol_vaddr = xol_vaddr;
+ utask->vaddr = bp_vaddr;
+
+ err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+ if (unlikely(err)) {
+ xol_free_insn_slot(current);
+ return err;
+ }
+
+ utask->active_uprobe = uprobe;
+ utask->state = UTASK_SSTEP;
+ return 0;
}
/*
@@ -1391,6 +1543,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
* This is not strictly accurate, we can race with
* uprobe_unregister() and see the already removed
* uprobe if delete_uprobe() was not yet called.
+ * Or this uprobe can be filtered out.
*/
if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
return;
@@ -1399,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
struct page *page;
uprobe_opcode_t opcode;
@@ -1417,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
- copy_opcode(page, vaddr, &opcode);
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
- return is_swbp_insn(&opcode);
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1433,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
vma = find_vma(mm, bp_vaddr);
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
- struct inode *inode = vma->vm_file->f_mapping->host;
+ struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
}
if (!uprobe)
- *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
} else {
*is_swbp = -EFAULT;
}
@@ -1452,20 +1606,116 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
return uprobe;
}
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct uprobe_consumer *uc;
+ int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
+
+ remove &= rc;
+ }
+
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
+ if (remove && uprobe->consumers) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instances invalidated by
+ * longjmp(); currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+
+ if (!chained)
+ break;
+
+ utask->depth--;
+
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
- struct uprobe_task *utask;
struct uprobe *uprobe;
unsigned long bp_vaddr;
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -1483,6 +1733,10 @@ static void handle_swbp(struct pt_regs *regs)
}
return;
}
+
+ /* change it in advance for ->handler() and restart */
+ instruction_pointer_set(regs, bp_vaddr);
+
/*
* TODO: move copy_insn/etc into _register and remove this hack.
* After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1744,16 @@ static void handle_swbp(struct pt_regs *regs)
*/
smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
- goto restart;
-
- utask = current->utask;
- if (!utask) {
- utask = add_utask();
- /* Cannot allocate; re-execute the instruction. */
- if (!utask)
- goto restart;
- }
+ goto out;
handler_chain(uprobe, regs);
if (can_skip_sstep(uprobe, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr)) {
- utask->active_uprobe = uprobe;
- utask->state = UTASK_SSTEP;
+ if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- }
-restart:
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
+ /* can_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1576,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
- if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
return 0;
set_thread_flag(TIF_UPROBE);
@@ -1609,10 +1851,8 @@ static int __init init_uprobes(void)
{
int i;
- for (i = 0; i < UPROBES_HASH_SZ; i++) {
- mutex_init(&uprobes_mutex[i]);
+ for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- }
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df21937216..af2eb3cbd499 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
-#include <linux/freezer.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *uninitialized_var(tty);
+ cputime_t utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime += tsk->utime;
- sig->stime += tsk->stime;
- sig->gtime += tsk->gtime;
+ task_cputime(tsk, &utime, &stime);
+ sig->utime += utime;
+ sig->stime += stime;
+ sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -483,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
- schedule();
+ freezable_schedule();
}
__set_task_state(tsk, TASK_RUNNING);
down_read(&mm->mmap_sem);
@@ -845,7 +847,7 @@ void do_exit(long code)
exit_io_context(tsk);
if (tsk->splice_pipe)
- __free_pipe_info(tsk->splice_pipe);
+ free_pipe_info(tsk->splice_pipe);
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
@@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
sig = p->signal;
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
- psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+ psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
@@ -1627,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1667,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf76..67460b93b1a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- if (main_extable_sort_needed)
+ if (main_extable_sort_needed) {
+ pr_notice("Sorting __ex_table...\n");
sort_extable(__start___ex_table, __stop___ex_table);
- else
- pr_notice("__ex_table already sorted, skipping sort\n");
+ }
}
/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index c535f33bbb9c..987b28a1f01b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -413,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = file->f_mapping;
get_file(file);
@@ -1141,6 +1142,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1230,9 +1234,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqlock_init(&p->vtime_seqlock);
+ p->vtime_snap = 0;
+ p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
+
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif
@@ -1294,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
@@ -1668,10 +1682,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int, tls_val)
#endif
{
- long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
- asmlinkage_protect(5, ret, clone_flags, newsp,
- parent_tidptr, child_tidptr, tls_val);
- return ret;
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif
@@ -1801,7 +1812,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* If unsharing a user namespace must also unshare the thread.
*/
if (unshare_flags & CLONE_NEWUSER)
- unshare_flags |= CLONE_THREAD;
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing a pid namespace must also unshare the thread.
*/
@@ -1855,10 +1866,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
exit_sem(current);
}
- if (new_nsproxy) {
+ if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
- new_nsproxy = NULL;
- }
task_lock(current);
@@ -1888,9 +1897,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
- if (new_nsproxy)
- put_nsproxy(new_nsproxy);
-
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca003..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
+#include <linux/sched/rt.h>
#include <asm/futex.h>
@@ -222,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
* The key words are stored in *key on success.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
@@ -704,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -1190,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * 1 - acquired the lock;
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1253,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1535,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1706,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1823,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2202,9 +2204,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 - no early wakeup detected;
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2246,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2257,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2275,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
if (!futex_cmpxchg_enabled)
return -ENOSYS;
- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
rcu_read_lock();
ret = -ESRCH;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
#include <linux/nsproxy.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
+#include <linux/syscalls.h>
#include <asm/uaccess.h>
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
}
}
-asmlinkage long
-compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
- compat_size_t len)
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
{
if (!futex_cmpxchg_enabled)
return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
return 0;
}
-asmlinkage long
-compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
- compat_size_t __user *len_ptr)
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
{
struct compat_robust_list_head __user *head;
unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
if (!futex_cmpxchg_enabled)
return -ENOSYS;
- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
-
rcu_read_lock();
ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
return ret;
}
-asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
- struct compat_timespec __user *utime, u32 __user *uaddr2,
- u32 val3)
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
{
struct timespec ts;
ktime_t t, *tp = NULL;
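
Switching to COMPAT_SYSCALL_DEFINEn lets the macro emit the per-architecture sign/zero extension of 32-bit arguments instead of trusting a raw asmlinkage prototype. A hypothetical two-argument example of the idiom:

#include <linux/compat.h>

static long do_example(void __user *p, compat_size_t len)
{
	return 0;	/* hypothetical worker */
}

COMPAT_SYSCALL_DEFINE2(example, compat_uptr_t, uptr, compat_size_t, len)
{
	void __user *p = compat_ptr(uptr);	/* widen the 32-bit pointer */

	if (!len)
		return -EINVAL;
	return do_example(p, len);
}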
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..fd4b13b131f8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
@@ -61,6 +63,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -81,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.get_time = &ktime_get_boottime,
.resolution = KTIME_LOW_RES,
},
+ {
+ .index = HRTIMER_BASE_TAI,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ .resolution = KTIME_LOW_RES,
+ },
}
};
@@ -88,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+ [CLOCK_TAI] = HRTIMER_BASE_TAI,
};
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -104,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
ktime_t xtim, mono, boot;
struct timespec xts, tom, slp;
+ s32 tai_offset;
get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
+ tai_offset = timekeeping_get_tai_offset();
xtim = timespec_to_ktime(xts);
mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -113,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
+ base->clock_base[HRTIMER_BASE_TAI].softirq_time =
+ ktime_add(xtim, ktime_set(tai_offset, 0));
}
/*
@@ -158,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
*/
static int hrtimer_get_target(int this_cpu, int pinned)
{
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
return get_nohz_timer_target();
#endif
@@ -273,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
} else {
unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+ /* Make sure nsec fits into long */
+ if (unlikely(nsec > KTIME_SEC_MAX))
+ return (ktime_t){ .tv64 = KTIME_MAX };
+
tmp = ktime_set((long)nsec, rem);
}
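
After do_div(), nsec holds whole seconds; without the new check, ktime_set() would truncate anything above KTIME_SEC_MAX into a bogus long. A sketch of the saturating behaviour the hunk adds (mirrors the diff; illustrative only):

#include <linux/ktime.h>
#include <asm/div64.h>

static ktime_t example_add_ns_saturating(ktime_t kt, u64 nsec)
{
	u64 sec = nsec;
	u32 rem = do_div(sec, NSEC_PER_SEC);	/* sec now whole seconds */

	if (sec > KTIME_SEC_MAX)		/* would overflow 'long' */
		return (ktime_t){ .tv64 = KTIME_MAX };
	return ktime_add(kt, ktime_set((long)sec, rem));
}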
@@ -640,29 +658,18 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
- if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- if (wakeup) {
- raw_spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- raw_spin_lock(&base->cpu_base->lock);
- } else
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-
- return 1;
- }
-
- return 0;
+ return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
}
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets(offs_real, offs_boot);
+ return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
}
/*
@@ -735,8 +742,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -995,8 +1001,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
*
* XXX send_remote_softirq() ?
*/
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
+ && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (wakeup) {
+ /*
+ * We need to drop cpu_base->lock to avoid a
+ * lock ordering issue vs. rq->lock.
+ */
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ local_irq_restore(flags);
+ return ret;
+ } else {
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+ }
unlock_hrtimer_base(timer, &flags);
@@ -1008,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
*
* Returns:
* 0 on success
@@ -1025,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
* hrtimer_start - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
*
* Returns:
* 0 on success
@@ -1104,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/**
* hrtimer_get_next_event - get the time until next expiry event
*
@@ -1307,6 +1328,8 @@ retry:
expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
+ if (expires.tv64 < 0)
+ expires.tv64 = KTIME_MAX;
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
break;
@@ -1640,8 +1663,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+ struct msi_desc *entry)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
desc->irq_data.msi_desc = entry;
- if (entry)
- entry->irq = irq;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
irq_put_desc_unlock(desc, flags);
return 0;
}
/**
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
+ *
+ * Set the MSI descriptor entry for an irq
+ */
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+{
+ return irq_set_msi_desc_off(irq, 0, entry);
+}
+
+/**
* irq_set_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c379..5a83dde8ca0c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
if (domain->ops->map) {
ret = domain->ops->map(domain, virq, hwirq);
if (ret != 0) {
- pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
- virq, hwirq, ret);
- WARN_ON(1);
+ /*
+ * If map() returns -EPERM, this interrupt is protected
+ * by the firmware or some other service and shall not
+ * be mapped.
+ *
+ * Since on some platforms we blindly try to map everything
+ * we end up with a log full of backtraces.
+ *
+ * So instead, we silently fail on -EPERM, it is the
+ * responsibility of the PIC driver to display a relevant
+ * message if needed.
+ */
+ if (ret != -EPERM) {
+ pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
+ virq, hwirq, ret);
+ WARN_ON(1);
+ }
irq_data->domain = NULL;
irq_data->hwirq = 0;
goto err_unmap;
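In other words, a PIC driver can now decline individual lines without flooding the log. A minimal sketch of such a map() callback; the chip and the "secure line" predicate are hypothetical:

static int my_pic_map(struct irq_domain *d, unsigned int virq,
		      irq_hw_number_t hwirq)
{
	/* hypothetical check: firmware-owned lines must not be mapped */
	if (my_pic_line_is_secure(hwirq))
		return -EPERM;	/* silently skipped by the core, no WARN */

	irq_set_chip_and_handler(virq, &my_pic_chip, handle_level_irq);
	return 0;
}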
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa479..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/task_work.h>
#include "internals.h"
@@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
out:
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(enable_percpu_irq);
void disable_percpu_irq(unsigned int irq)
{
@@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
irq_percpu_disable(desc, cpu);
irq_put_desc_unlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(disable_percpu_irq);
/*
* Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..19ed5c425c3b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+ unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
cpumask_var_t new_value;
int err;
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_affinity_proc_fops = {
@@ -212,7 +212,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE(inode)->data);
+ return single_open(file, default_affinity_show, PDE_DATA(inode));
}
static const struct file_operations default_affinity_proc_fops = {
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
static int irq_node_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_node_proc_show, PDE(inode)->data);
+ return single_open(file, irq_node_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_node_proc_fops = {
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
static int irq_spurious_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
+ return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_spurious_proc_fops = {
@@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
- if (action->dir) {
- struct irq_desc *desc = irq_to_desc(irq);
-
- remove_proc_entry(action->dir->name, desc->dir);
- }
+ proc_remove(action->dir);
}
static void register_default_affinity_proc(void)
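The conversion pattern above generalizes to any proc file: single_open() receives whatever pointer was passed to proc_create_data(), now fetched with PDE_DATA() instead of reaching into the proc_dir_entry. A sketch with hypothetical names (struct my_dev and its count field are assumptions):

static int my_stats_show(struct seq_file *m, void *v)
{
	struct my_dev *dev = m->private;	/* set by single_open() below */

	seq_printf(m, "count: %lu\n", dev->count);
	return 0;
}

static int my_stats_open(struct inode *inode, struct file *file)
{
	/* PDE_DATA(inode) replaces the old PDE(inode)->data */
	return single_open(file, my_stats_show, PDE_DATA(inode));
}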
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
/*
* All handlers must agree on IRQF_SHARED, so we test just the
- * first. Check for action->next as well.
+ * first.
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER) ||
- (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
- !action->next)
+ (action->flags & __IRQF_TIMER))
goto out;
/* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
ret = IRQ_HANDLED;
+ /* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
#include <asm/processor.h>
-/*
- * An entry can be in one of four states:
- *
- * free NULL, 0 -> {claimed} : free to be used
- * claimed NULL, 3 -> {pending} : claimed to be enqueued
- * pending next, 3 -> {busy} : queued, pending callback
- * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- */
-
-#define IRQ_WORK_PENDING 1UL
-#define IRQ_WORK_BUSY 2UL
-#define IRQ_WORK_FLAGS 3UL
static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+static DEFINE_PER_CPU(int, irq_work_raised);
/*
* Claim the entry so that no one else will poke at it.
*/
static bool irq_work_claim(struct irq_work *work)
{
- unsigned long flags, nflags;
+ unsigned long flags, oflags, nflags;
+ /*
+	 * Start from our best-guess flag value, but only trust the
+	 * value that cmpxchg() actually returns.
+ */
+ flags = work->flags & ~IRQ_WORK_PENDING;
for (;;) {
- flags = work->flags;
- if (flags & IRQ_WORK_PENDING)
- return false;
nflags = flags | IRQ_WORK_FLAGS;
- if (cmpxchg(&work->flags, flags, nflags) == flags)
+ oflags = cmpxchg(&work->flags, flags, nflags);
+ if (oflags == flags)
break;
+ if (oflags & IRQ_WORK_PENDING)
+ return false;
+ flags = oflags;
cpu_relax();
}
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
}
/*
- * Queue the entry and raise the IPI if needed.
+ * Enqueue the irq_work @entry unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
*/
-static void __irq_work_queue(struct irq_work *work)
+void irq_work_queue(struct irq_work *work)
{
- bool empty;
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return;
+ /* Queue the entry and raise the IPI if needed. */
preempt_disable();
- empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
- /* The list was empty, raise self-interrupt to start processing. */
- if (empty)
- arch_irq_work_raise();
+ llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
+
+ /*
+ * If the work is not "lazy" or the tick is stopped, raise the irq
+ * work interrupt (if supported by the arch), otherwise, just wait
+ * for the next tick.
+ */
+ if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
+ if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ arch_irq_work_raise();
+ }
preempt_enable();
}
+EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Enqueue the irq_work @entry, returns true on success, failure when the
- * @entry was already enqueued by someone else.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue(struct irq_work *work)
+bool irq_work_needs_cpu(void)
{
- if (!irq_work_claim(work)) {
- /*
- * Already enqueued, can't do!
- */
+ struct llist_head *this_list;
+
+ this_list = &__get_cpu_var(irq_work_list);
+ if (llist_empty(this_list))
return false;
- }
- __irq_work_queue(work);
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
- */
-void irq_work_run(void)
+static void __irq_work_run(void)
{
+ unsigned long flags;
struct irq_work *work;
struct llist_head *this_list;
struct llist_node *llnode;
+
+ /*
+ * Reset the "raised" state right before we check the list because
+	 * an NMI may enqueue work right after the runner finds the list empty.
+ */
+ __this_cpu_write(irq_work_raised, 0);
+ barrier();
+
this_list = &__get_cpu_var(irq_work_list);
if (llist_empty(this_list))
return;
- BUG_ON(!in_irq());
BUG_ON(!irqs_disabled());
llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
/*
* Clear the PENDING bit, after this point the @work
* can be re-used.
+ * Make it immediately visible so that other CPUs trying
+ * to claim that work don't rely on us to handle their data
+ * while we are in the middle of the func.
*/
- work->flags = IRQ_WORK_BUSY;
+ flags = work->flags & ~IRQ_WORK_PENDING;
+ xchg(&work->flags, flags);
+
work->func(work);
/*
* Clear the BUSY bit and return to the free state if
* no-one else claimed it meanwhile.
*/
- (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
+ (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
}
}
+
+/*
+ * Run the irq_work entries on this cpu. Must be run from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+ BUG_ON(!in_irq());
+ __irq_work_run();
+}
EXPORT_SYMBOL_GPL(irq_work_run);
/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int irq_work_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_DYING:
+ /* Called from stop_machine */
+ if (WARN_ON_ONCE(cpu != smp_processor_id()))
+ break;
+ __irq_work_run();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpu_notify;
+
+static __init int irq_work_init_cpu_notifier(void)
+{
+ cpu_notify.notifier_call = irq_work_cpu_notify;
+ cpu_notify.priority = 0;
+ register_cpu_notifier(&cpu_notify);
+ return 0;
+}
+device_initcall(irq_work_init_cpu_notifier);
+
+#endif /* CONFIG_HOTPLUG_CPU */
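Tying the pieces together: work flagged IRQ_WORK_LAZY now rides the next tick instead of raising an IPI, unless the tick is stopped. A minimal sketch of a user, assuming the IRQ_WORK_LAZY flag from <linux/irq_work.h>; the callback body is illustrative:

#include <linux/irq_work.h>

static void my_deferred_fn(struct irq_work *work)
{
	/* runs with IRQs disabled, from the irq_work path or the tick */
}

static struct irq_work my_work = {
	.func	= my_deferred_fn,
	.flags	= IRQ_WORK_LAZY,	/* wait for the tick if it is running */
};

static void from_nmi(void)
{
	irq_work_queue(&my_work);	/* claim-protected: no-op if pending */
}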
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba529..3127ad52cdb2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr)
/*
* Expand a compressed symbol data into the resulting uncompressed string,
+ * truncating the output if it would exceed maxlen (including the NUL),
* given the offset to where the symbol is in the compressed stream.
*/
-static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+static unsigned int kallsyms_expand_symbol(unsigned int off,
+ char *result, size_t maxlen)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
while (*tptr) {
if (skipped_first) {
+ if (maxlen <= 1)
+ goto tail;
*result = *tptr;
result++;
+ maxlen--;
} else
skipped_first = 1;
tptr++;
}
}
- *result = '\0';
+tail:
+ if (maxlen)
+ *result = '\0';
/* Return to offset to the next symbol. */
return off;
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr,
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
return namebuf;
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
pos = get_symbol_pos(addr, NULL, NULL);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ symname, KSYM_NAME_LEN);
return 0;
}
/* See if it's in a module. */
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
pos = get_symbol_pos(addr, size, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ name, KSYM_NAME_LEN);
modname[0] = '\0';
return 0;
}
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
iter->type = kallsyms_get_symbol_type(off);
- off = kallsyms_expand_symbol(off, iter->name);
+ off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
return off - iter->nameoff;
}
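The guard added to kallsyms_expand_symbol() is the classic bounded-copy pattern: write at most maxlen - 1 characters and NUL-terminate only when there is room. Distilled as a standalone sketch:

/* Sketch of the truncation guard above: never writes past dst[maxlen-1]. */
static size_t bounded_copy(char *dst, const char *src, size_t maxlen)
{
	size_t n = 0;

	while (*src) {
		if (maxlen <= 1)
			break;		/* leave room for the terminator */
		*dst++ = *src++;
		maxlen--;
		n++;
	}
	if (maxlen)
		*dst = '\0';		/* skipped entirely when maxlen == 0 */
	return n;
}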
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..59f7b55ba745 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
int kexec_should_crash(struct task_struct *p)
{
@@ -223,6 +229,8 @@ out:
}
+static void kimage_free_page_list(struct list_head *list);
+
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
if (result)
goto out;
- *rimage = image;
-
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ goto out_free;
}
image->swap_page = kimage_alloc_control_pages(image, 0);
if (!image->swap_page) {
printk(KERN_ERR "Could not allocate swap buffer\n");
- goto out;
+ goto out_free;
}
- result = 0;
- out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+out:
return result;
}
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out;
+ goto out_free;
}
/*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
- goto out;
+ goto out_free;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ *rimage = image;
+ return 0;
+out_free:
+ kfree(image);
+out:
return result;
}
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
break;
- if (hole_end > crashk_res.end)
- break;
/* See if I overlap any of the segments */
for (i = 0; i < image->nr_segments; i++) {
unsigned long mstart, mend;
@@ -783,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image,
struct kexec_segment *segment)
{
unsigned long maddr;
- unsigned long ubytes, mbytes;
+ size_t ubytes, mbytes;
int result;
unsigned char __user *buf;
@@ -816,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image,
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
- mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes)
- mchunk = mbytes;
-
- uchunk = mchunk;
- if (uchunk > ubytes)
- uchunk = ubytes;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
@@ -847,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
* We do things a page at a time for the sake of kmap.
*/
unsigned long maddr;
- unsigned long ubytes, mbytes;
+ size_t ubytes, mbytes;
int result;
unsigned char __user *buf;
@@ -868,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
- mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes)
- mchunk = mbytes;
-
- uchunk = mchunk;
- if (uchunk > ubytes) {
- uchunk = ubytes;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
@@ -1115,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
{
unsigned long addr;
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
- init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
- free_page((unsigned long)__va(addr));
- totalram_pages++;
- }
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1365,34 +1357,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
return 0;
}
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * This function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
*/
-int __init parse_crashkernel(char *cmdline,
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
+static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
- unsigned long long *crash_base)
+ unsigned long long *crash_base,
+ const char *name,
+ const char *suffix)
{
- char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
+ char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, "crashkernel=");
- while (p) {
- ck_cmdline = p;
- p = strstr(p+1, "crashkernel=");
- }
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
- ck_cmdline += 12; /* strlen("crashkernel=") */
+ ck_cmdline += strlen(name);
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ crash_base, suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1409,6 +1481,36 @@ int __init parse_crashkernel(char *cmdline,
return 0;
}
+/*
+ * This function is the entry point for command-line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
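The interaction of the last-match rule with the suffix table is easiest to see on a concrete command line. An illustrative sketch (sizes arbitrary; total_ram and reserve_high_region() are hypothetical arch-side names):

/*
 * cmdline: "crashkernel=256M crashkernel=512M,high crashkernel=72M,low"
 *
 * parse_crashkernel()      -> 256M  (entries with a known suffix are skipped)
 * parse_crashkernel_high() -> 512M  (last entry ending in ",high")
 * parse_crashkernel_low()  -> 72M   (last entry ending in ",low")
 */
unsigned long long size, base;

if (!parse_crashkernel_high(boot_command_line, total_ram, &size, &base))
	reserve_high_region(size, base);	/* hypothetical helper */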
static void update_vmcoreinfo_note(void)
{
@@ -1431,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
{
va_list args;
char buf[0x50];
- int r;
+ size_t r;
va_start(args, fmt);
r = vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- if (r + vmcoreinfo_size > vmcoreinfo_max_size)
- r = vmcoreinfo_max_size - vmcoreinfo_size;
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
@@ -1468,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmlist);
+ VMCOREINFO_SYMBOL(vmap_area_list);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
@@ -1490,6 +1591,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, _count);
VMCOREINFO_OFFSET(page, mapping);
VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1504,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
@@ -1512,6 +1616,11 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_lru);
VMCOREINFO_NUMBER(PG_private);
VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
-/*
- * A generic kernel FIFO implementation
- *
- * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/log2.h>
-#include <linux/uaccess.h>
-#include <linux/kfifo.h>
-
-/*
- * internal helper to calculate the unused elements in a fifo
- */
-static inline unsigned int kfifo_unused(struct __kfifo *fifo)
-{
- return (fifo->mask + 1) - (fifo->in - fifo->out);
-}
-
-int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
- size_t esize, gfp_t gfp_mask)
-{
- /*
- * round down to the next power of 2, since our 'let the indices
- * wrap' technique works only in this case.
- */
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
-
- if (size < 2) {
- fifo->data = NULL;
- fifo->mask = 0;
- return -EINVAL;
- }
-
- fifo->data = kmalloc(size * esize, gfp_mask);
-
- if (!fifo->data) {
- fifo->mask = 0;
- return -ENOMEM;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_alloc);
-
-void __kfifo_free(struct __kfifo *fifo)
-{
- kfree(fifo->data);
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = 0;
- fifo->data = NULL;
- fifo->mask = 0;
-}
-EXPORT_SYMBOL(__kfifo_free);
-
-int __kfifo_init(struct __kfifo *fifo, void *buffer,
- unsigned int size, size_t esize)
-{
- size /= esize;
-
- if (!is_power_of_2(size))
- size = rounddown_pow_of_two(size);
-
- fifo->in = 0;
- fifo->out = 0;
- fifo->esize = esize;
- fifo->data = buffer;
-
- if (size < 2) {
- fifo->mask = 0;
- return -EINVAL;
- }
- fifo->mask = size - 1;
-
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_init);
-
-static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(fifo->data + off, src, l);
- memcpy(fifo->data, src + l, len - l);
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_in(struct __kfifo *fifo,
- const void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- kfifo_copy_in(fifo, buf, len, fifo->in);
- fifo->in += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in);
-
-static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
- unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- memcpy(dst, fifo->data + off, l);
- memcpy(dst + l, fifo->data, len - l);
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
-}
-
-unsigned int __kfifo_out_peek(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- kfifo_copy_out(fifo, buf, len, fifo->out);
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_peek);
-
-unsigned int __kfifo_out(struct __kfifo *fifo,
- void *buf, unsigned int len)
-{
- len = __kfifo_out_peek(fifo, buf, len);
- fifo->out += len;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out);
-
-static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
- const void __user *from, unsigned int len, unsigned int off,
- unsigned int *copied)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned long ret;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_from_user(fifo->data + off, from, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_from_user(fifo->data, from + l, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data in the fifo is up to date before
- * incrementing the fifo->in index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->in += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_from_user);
-
-static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
- unsigned int len, unsigned int off, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- ret = copy_to_user(to, fifo->data + off, l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret + len - l, esize);
- else {
- ret = copy_to_user(to + l, fifo->data, len - l);
- if (unlikely(ret))
- ret = DIV_ROUND_UP(ret, esize);
- }
- /*
- * make sure that the data is copied before
- * incrementing the fifo->out index counter
- */
- smp_wmb();
- *copied = len - ret;
- /* return the number of elements which are not copied */
- return ret;
-}
-
-int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied)
-{
- unsigned int l;
- unsigned long ret;
- unsigned int esize = fifo->esize;
- int err;
-
- if (esize != 1)
- len /= esize;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
- if (unlikely(ret)) {
- len -= ret;
- err = -EFAULT;
- } else
- err = 0;
- fifo->out += len;
- return err;
-}
-EXPORT_SYMBOL(__kfifo_to_user);
-
-static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
- int nents, unsigned int len)
-{
- int n;
- unsigned int l;
- unsigned int off;
- struct page *page;
-
- if (!nents)
- return 0;
-
- if (!len)
- return 0;
-
- n = 0;
- page = virt_to_page(buf);
- off = offset_in_page(buf);
- l = 0;
-
- while (len >= l + PAGE_SIZE - off) {
- struct page *npage;
-
- l += PAGE_SIZE;
- buf += PAGE_SIZE;
- npage = virt_to_page(buf);
- if (page_to_phys(page) != page_to_phys(npage) - l) {
- sg_set_page(sgl, page, l - off, off);
- sgl = sg_next(sgl);
- if (++n == nents || sgl == NULL)
- return n;
- page = npage;
- len -= l - off;
- l = off = 0;
- }
- }
- sg_set_page(sgl, page, len, off);
- return n + 1;
-}
-
-static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
- int nents, unsigned int len, unsigned int off)
-{
- unsigned int size = fifo->mask + 1;
- unsigned int esize = fifo->esize;
- unsigned int l;
- unsigned int n;
-
- off &= fifo->mask;
- if (esize != 1) {
- off *= esize;
- size *= esize;
- len *= esize;
- }
- l = min(len, size - off);
-
- n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
- n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
-
- return n;
-}
-
-unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = kfifo_unused(fifo);
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare);
-
-unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len)
-{
- unsigned int l;
-
- l = fifo->in - fifo->out;
- if (len > l)
- len = l;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare);
-
-unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
-{
- unsigned int max = (1 << (recsize << 3)) - 1;
-
- if (len > max)
- return max;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_max_r);
-
-#define __KFIFO_PEEK(data, out, mask) \
- ((data)[(out) & (mask)])
-/*
- * __kfifo_peek_n internal helper function for determinate the length of
- * the next record in the fifo
- */
-static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int l;
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- l = __KFIFO_PEEK(data, fifo->out, mask);
-
- if (--recsize)
- l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
-
- return l;
-}
-
-#define __KFIFO_POKE(data, in, mask, val) \
- ( \
- (data)[(in) & (mask)] = (unsigned char)(val) \
- )
-
-/*
- * __kfifo_poke_n internal helper function for storeing the length of
- * the record into the fifo
- */
-static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
-{
- unsigned int mask = fifo->mask;
- unsigned char *data = fifo->data;
-
- __KFIFO_POKE(data, fifo->in, mask, n);
-
- if (recsize > 1)
- __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
-}
-
-unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
-{
- return __kfifo_peek_n(fifo, recsize);
-}
-EXPORT_SYMBOL(__kfifo_len_r);
-
-unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
- unsigned int len, size_t recsize)
-{
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- __kfifo_poke_n(fifo, len, recsize);
-
- kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
- fifo->in += len + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_in_r);
-
-static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
- void *buf, unsigned int len, size_t recsize, unsigned int *n)
-{
- *n = __kfifo_peek_n(fifo, recsize);
-
- if (len > *n)
- len = *n;
-
- kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
- return len;
-}
-
-unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
-}
-EXPORT_SYMBOL(__kfifo_out_peek_r);
-
-unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
- unsigned int len, size_t recsize)
-{
- unsigned int n;
-
- if (fifo->in == fifo->out)
- return 0;
-
- len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
- fifo->out += n + recsize;
- return len;
-}
-EXPORT_SYMBOL(__kfifo_out_r);
-
-void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int n;
-
- n = __kfifo_peek_n(fifo, recsize);
- fifo->out += n + recsize;
-}
-EXPORT_SYMBOL(__kfifo_skip_r);
-
-int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo)) {
- *copied = 0;
- return 0;
- }
-
- __kfifo_poke_n(fifo, len, recsize);
-
- ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->in += len + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_from_user_r);
-
-int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
- unsigned long len, unsigned int *copied, size_t recsize)
-{
- unsigned long ret;
- unsigned int n;
-
- if (fifo->in == fifo->out) {
- *copied = 0;
- return 0;
- }
-
- n = __kfifo_peek_n(fifo, recsize);
- if (len > n)
- len = n;
-
- ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
- if (unlikely(ret)) {
- *copied = 0;
- return -EFAULT;
- }
- fifo->out += n + recsize;
- return 0;
-}
-EXPORT_SYMBOL(__kfifo_to_user_r);
-
-unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > kfifo_unused(fifo))
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
-
-void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
- unsigned int len, size_t recsize)
-{
- len = __kfifo_max_r(len, recsize);
- __kfifo_poke_n(fifo, len, recsize);
- fifo->in += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
-
-unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
- struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
-{
- if (!nents)
- BUG();
-
- len = __kfifo_max_r(len, recsize);
-
- if (len + recsize > fifo->in - fifo->out)
- return 0;
-
- return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
-}
-EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
-
-void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
-{
- unsigned int len;
-
- len = __kfifo_peek_n(fifo, recsize);
- fifo->out += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
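The kfifo implementation leaves kernel/ here (it is relocated to lib/ in the same series, not dropped). The core trick it relies on is worth restating: with a power-of-2 size, in and out are free-running unsigned counters, masked only when indexing the buffer, so the difference in - out stays correct across wraparound. A self-contained userspace sketch of the put side, under those assumptions:

#include <string.h>

struct ring {
	unsigned int in, out, mask;	/* mask = size - 1, size = 2^n */
	unsigned char *data;
};

static unsigned int ring_unused(struct ring *r)
{
	return (r->mask + 1) - (r->in - r->out);
}

static unsigned int ring_put(struct ring *r, const void *src, unsigned int len)
{
	unsigned int size = r->mask + 1;
	unsigned int off, l;

	if (len > ring_unused(r))
		len = ring_unused(r);
	off = r->in & r->mask;			/* mask only on access */
	l = len < size - off ? len : size - off; /* chunk up to the wrap point */
	memcpy(r->data + off, src, l);
	memcpy(r->data, (const unsigned char *)src + l, len - l);
	r->in += len;				/* free-running, may wrap */
	return len;
}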
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de6..1296e72e4161 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
+#include <linux/async.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -76,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
static int call_modprobe(char *module_name, int wait)
{
+ struct subprocess_info *info;
static char *envp[] = {
"HOME=/",
"TERM=linux",
@@ -97,8 +99,15 @@ static int call_modprobe(char *module_name, int wait)
argv[3] = module_name; /* check free_modprobe_argv() */
argv[4] = NULL;
- return call_usermodehelper_fns(modprobe_path, argv, envp,
- wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
+ info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
+ NULL, free_modprobe_argv, NULL);
+ if (!info)
+ goto free_module_name;
+
+ return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+
+free_module_name:
+ kfree(module_name);
free_argv:
kfree(argv);
out:
@@ -130,6 +139,14 @@ int __request_module(bool wait, const char *fmt, ...)
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ /*
+ * We don't allow synchronous module loading from async. Module
+ * init may invoke async_synchronize_full() which will end up
+	 * waiting for this task, which is already waiting for the module
+ * loading to complete, leading to a deadlock.
+ */
+ WARN_ON_ONCE(wait && current_is_async());
+
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
@@ -493,14 +510,28 @@ static void helper_unlock(void)
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is called just before the subprocess_info is
+ * about to be freed. This can be used for freeing the argv and envp.
+ * The function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
*/
-static
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
- char **envp, gfp_t gfp_mask)
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
@@ -511,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
out:
return sub_info;
}
-
-/**
- * call_usermodehelper_setfns - set a cleanup/init function
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-static
-void call_usermodehelper_setfns(struct subprocess_info *info,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- info->cleanup = cleanup;
- info->init = init;
- info->data = data;
-}
+EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_exec - start a usermode application
 * @sub_info: information about the subprocess
* @wait: wait for the application to finish and return status.
- * when -1 don't wait at all, but you get no useful error back when
- * the program couldn't be exec'ed. This makes it safe to call
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
-static
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
@@ -606,31 +614,34 @@ unlock:
helper_unlock();
return retval;
}
+EXPORT_SYMBOL(call_usermodehelper_exec);
-/*
- * call_usermodehelper_fns() will not run the caller-provided cleanup function
- * if a memory allocation failure is experienced. So the caller might need to
- * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
- * the necessaary cleanup within the caller.
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is equivalent to calling call_usermodehelper_setup()
+ * followed by call_usermodehelper_exec().
*/
-int call_usermodehelper_fns(
- char *path, char **argv, char **envp, int wait,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *), void *data)
+int call_usermodehelper(char *path, char **argv, char **envp, int wait)
{
struct subprocess_info *info;
gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
- info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
-
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
if (info == NULL)
return -ENOMEM;
- call_usermodehelper_setfns(info, init, cleanup, data);
-
return call_usermodehelper_exec(info, wait);
}
-EXPORT_SYMBOL(call_usermodehelper_fns);
+EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
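With call_usermodehelper_fns() gone, callers that need init/cleanup hooks use the two-step API directly, exactly as the new call_usermodehelper() above does. A sketch mirroring it (argv/envp contents are the caller's business):

static int run_helper(char *path, char **argv, char **envp)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL,
					 NULL, NULL, NULL);	/* no hooks */
	if (!info)
		return -ENOMEM;

	/* UMH_WAIT_PROC: wait for the helper to exit and get its status */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}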
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
struct kprobe __kprobes *get_kprobe(void *addr)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (p->addr == addr)
return p;
}
@@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
-static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
/*
@@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
/* Start optimizer after OPTIMIZE_DELAY passed */
static __kprobes void kick_kprobe_optimizer(void)
{
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
@@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
- else
- /* Wake up all waiters */
- complete_all(&optimizer_comp);
}
/* Wait for completing optimization and unoptimization */
static __kprobes void wait_for_kprobe_optimizer(void)
{
- if (delayed_work_pending(&optimizing_work))
- wait_for_completion(&optimizer_comp);
+ mutex_lock(&kprobe_mutex);
+
+ while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
+ mutex_unlock(&kprobe_mutex);
+
+		/* this will also make optimizing_work execute immediately */
+ flush_delayed_work(&optimizing_work);
+ /* @optimizing_work might not have been queued yet, relax */
+ cpu_relax();
+
+ mutex_lock(&kprobe_mutex);
+ }
+
+ mutex_unlock(&kprobe_mutex);
}
/* Optimize kprobe if p is ready to be optimized */
@@ -788,53 +794,58 @@ out:
}
#ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
- return;
+ goto out;
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+ mutex_unlock(&kprobe_mutex);
}
-/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
- if (!kprobes_allow_optimization)
+ if (!kprobes_allow_optimization) {
+ mutex_unlock(&kprobe_mutex);
return;
+ }
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
}
+ mutex_unlock(&kprobe_mutex);
+
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@@ -842,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
{
int ret;
- mutex_lock(&kprobe_mutex);
+ mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -850,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_mutex);
+ mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
@@ -919,7 +930,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
}
#endif /* CONFIG_OPTPROBES */
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +975,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
(unsigned long)p->addr, 1, 0);
WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
}
-#else /* !KPROBES_CAN_USE_FTRACE */
+#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p) do {} while (0)
#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1141,7 +1152,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *tmp;
unsigned long hash, flags = 0;
if (unlikely(!kprobes_initialized))
@@ -1152,12 +1163,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task == tk)
recycle_rp_inst(ri, &empty_rp);
}
kretprobe_table_unlock(hash, &flags);
- hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -1166,9 +1177,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
- hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+ hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
@@ -1178,14 +1189,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
- struct hlist_node *pos, *next;
+ struct hlist_node *next;
struct hlist_head *head;
/* No race here */
for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
kretprobe_table_lock(hash, &flags);
head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+ hlist_for_each_entry_safe(ri, next, head, hlist) {
if (ri->rp == rp)
ri->rp = NULL;
}
@@ -1414,12 +1425,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
*/
ftrace_addr = ftrace_location((unsigned long)p->addr);
if (ftrace_addr) {
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
/* Given address is not on the instruction boundary */
if ((unsigned long)p->addr != ftrace_addr)
return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
-#else /* !KPROBES_CAN_USE_FTRACE */
+#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
#endif
}
@@ -2021,7 +2032,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
{
struct module *mod = data;
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
@@ -2038,7 +2048,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2185,7 +2195,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
@@ -2194,7 +2203,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
head = &kprobe_table[i];
preempt_disable();
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (kprobe_aggrprobe(p)) {
@@ -2229,7 +2238,6 @@ static const struct file_operations debugfs_kprobes_operations = {
static void __kprobes arm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2242,7 +2250,7 @@ static void __kprobes arm_all_kprobes(void)
/* Arming kprobes doesn't optimize kprobe itself */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist)
if (!kprobe_disabled(p))
arm_kprobe(p);
}
@@ -2258,7 +2266,6 @@ already_enabled:
static void __kprobes disarm_all_kprobes(void)
{
struct hlist_head *head;
- struct hlist_node *node;
struct kprobe *p;
unsigned int i;
@@ -2275,7 +2282,7 @@ static void __kprobes disarm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, node, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
disarm_kprobe(p, false);
}
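The change threaded through this whole file is mechanical: hlist_for_each_entry_rcu() (and the _safe variant) dropped the separate hlist_node cursor, so the typed entry is the only loop variable. The fragment from get_kprobe() above reduces to:

struct kprobe *p;

/* new style: no struct hlist_node *node cursor needed */
hlist_for_each_entry_rcu(p, head, hlist) {
	if (p->addr == addr)
		return p;
}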
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..760e86df8c20 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
+#include <linux/uaccess.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -52,8 +53,21 @@ enum KTHREAD_BITS {
KTHREAD_IS_PARKED,
};
-#define to_kthread(tsk) \
- container_of((tsk)->vfork_done, struct kthread, exited)
+#define __to_kthread(vfork) \
+ container_of(vfork, struct kthread, exited)
+
+static inline struct kthread *to_kthread(struct task_struct *k)
+{
+ return __to_kthread(k->vfork_done);
+}
+
+static struct kthread *to_live_kthread(struct task_struct *k)
+{
+ struct completion *vfork = ACCESS_ONCE(k->vfork_done);
+ if (likely(vfork))
+ return __to_kthread(vfork);
+ return NULL;
+}
/**
* kthread_should_stop - should this kthread return now?
@@ -122,14 +136,32 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+/**
+ * probe_kthread_data - speculative version of kthread_data()
+ * @task: possible kthread task in question
+ *
+ * @task could be a kthread task. Return the data value specified when it
+ * was created if accessible. If @task isn't a kthread task or its data is
+ * inaccessible for any reason, %NULL is returned. This function requires
+ * that @task itself is safe to dereference.
+ */
+void *probe_kthread_data(struct task_struct *task)
+{
+ struct kthread *kthread = to_kthread(task);
+ void *data = NULL;
+
+ probe_kernel_read(&data, &kthread->data, sizeof(data));
+ return data;
+}
+
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
complete(&self->parked);
schedule();
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
}
clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
@@ -256,11 +288,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, state)) {
+ WARN_ON(1);
+ return;
+ }
/* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu));
- p->flags |= PF_THREAD_BOUND;
+ p->flags |= PF_NO_SETAFFINITY;
}
/**
@@ -274,12 +311,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
- if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
- WARN_ON(1);
- return;
- }
- __kthread_bind(p, cpu);
+ __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
@@ -311,17 +343,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
return p;
}
-static struct kthread *task_get_live_kthread(struct task_struct *k)
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
{
- struct kthread *kthread;
-
- get_task_struct(k);
- kthread = to_kthread(k);
- /* It might have exited */
- barrier();
- if (k->vfork_done != NULL)
- return kthread;
- return NULL;
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+ wake_up_state(k, TASK_PARKED);
+ }
}
/**
@@ -334,23 +369,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
*/
void kthread_unpark(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread = to_live_kthread(k);
- if (kthread) {
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- /*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
- */
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu);
- wake_up_process(k);
- }
- }
- put_task_struct(k);
+ if (kthread)
+ __kthread_unpark(k, kthread);
}
/**
@@ -367,7 +389,7 @@ void kthread_unpark(struct task_struct *k)
*/
int kthread_park(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread = to_live_kthread(k);
int ret = -ENOSYS;
if (kthread) {
@@ -380,7 +402,6 @@ int kthread_park(struct task_struct *k)
}
ret = 0;
}
- put_task_struct(k);
return ret;
}
@@ -401,21 +422,23 @@ int kthread_park(struct task_struct *k)
*/
int kthread_stop(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
+
+ get_task_struct(k);
+ kthread = to_live_kthread(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ __kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
-
put_task_struct(k);
- trace_sched_kthread_stop_ret(ret);
+ trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
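
For context, probe_kthread_data() above leans on probe_kernel_read(), which copies from a possibly invalid kernel address and returns -EFAULT instead of faulting. A minimal sketch of a hypothetical caller (the helper below is illustrative only, not part of the patch):

#include <linux/kthread.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Hypothetical diagnostic helper: safe even if @task is not a kthread,
 * because probe_kthread_data() swallows the bad dereference. */
static void report_possible_worker(struct task_struct *task)
{
	void *data = probe_kthread_data(task);

	if (data)
		pr_info("%s: kthread data %p\n", task->comm, data);
}
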
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..1f3186b37fd5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+
static int save_trace(struct stack_trace *trace)
{
trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
}
raw_local_irq_restore(flags);
- printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
return NULL;
}
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
- printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
return NULL;
}
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- struct held_lock *hlock_curr, *hlock_next;
+ struct held_lock *hlock_curr;
int i, j;
/*
@@ -2048,8 +2052,7 @@ cache_hit:
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
return 0;
}
@@ -2057,12 +2060,10 @@ cache_hit:
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
- hlock_next = hlock;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock_next->irq_context)
+ if (hlock_curr->irq_context != hlock->irq_context)
break;
- hlock_next = hlock;
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
@@ -2997,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
static int
print_lock_nested_lock_not_held(struct task_struct *curr,
@@ -3190,9 +3192,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
- printk("BUG: MAX_LOCK_DEPTH too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
+ curr->lockdep_depth, MAX_LOCK_DEPTH);
+
+ lockdep_print_held_locks(current);
+ debug_show_all_locks();
dump_stack();
+
return 0;
}
@@ -3203,7 +3210,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
static int
-print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
unsigned long ip)
{
if (!debug_locks_off())
@@ -3246,7 +3253,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 0;
if (curr->lockdep_depth <= 0)
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
return 1;
}
@@ -3317,7 +3324,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3391,7 @@ lock_release_non_nested(struct task_struct *curr,
goto found_it;
prev_hlock = hlock;
}
- return print_unlock_inbalance_bug(curr, lock, ip);
+ return print_unlock_imbalance_bug(curr, lock, ip);
found_it:
if (hlock->instance == lock)
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
index 246b4c6e6135..4a9a86d12c8b 100644
--- a/kernel/modsign_certificate.S
+++ b/kernel/modsign_certificate.S
@@ -1,15 +1,8 @@
-/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
-#ifndef SYMBOL_PREFIX
-#define ASM_SYMBOL(sym) sym
-#else
-#define PASTE2(x,y) x##y
-#define PASTE(x,y) PASTE2(x,y)
-#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
-#endif
+#include <linux/export.h>
#define GLOBAL(name) \
- .globl ASM_SYMBOL(name); \
- ASM_SYMBOL(name):
+ .globl VMLINUX_SYMBOL(name); \
+ VMLINUX_SYMBOL(name):
.section ".init.data","aw"
diff --git a/kernel/module.c b/kernel/module.c
index eab08274ec9b..b049939177f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
return -ENOENT;
}
-static inline void add_taint_module(struct module *mod, unsigned flag)
+static inline void add_taint_module(struct module *mod, unsigned flag,
+ enum lockdep_ok lockdep_ok)
{
- add_taint(flag);
+ add_taint(flag, lockdep_ok);
mod->taints |= (1U << flag);
}
@@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
{
int ret = (flags & O_TRUNC);
if (ret)
- add_taint(TAINT_FORCED_RMMOD);
+ add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
return ret;
}
#else
@@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
if (!test_taint(TAINT_FORCED_MODULE))
printk(KERN_WARNING "%s: %s: kernel tainted.\n",
mod->name, reason);
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
return 0;
#else
return -ENOEXEC;
@@ -1208,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
/* Since this should be found in kernel (which can't be removed),
* no locking is necessary. */
- if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
&crc, true, false))
BUG();
- return check_version(sechdrs, versindex, "module_layout", mod, crc,
+ return check_version(sechdrs, versindex,
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc,
NULL);
}
@@ -1860,12 +1862,12 @@ static void free_module(struct module *mod)
{
trace_module_free(mod);
- /* Delete from various lists */
- mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
- mutex_unlock(&module_mutex);
mod_sysfs_teardown(mod);
+ /* We leave it in the list to prevent duplicate loads, but make sure
+ * that no one uses it while it's being deconstructed. */
+ mod->state = MODULE_STATE_UNFORMED;
+
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1878,6 +1880,11 @@ static void free_module(struct module *mod)
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ stop_machine(__unlink_module, mod, NULL);
+ mutex_unlock(&module_mutex);
+
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
module_free(mod, mod->module_init);
@@ -2147,7 +2154,8 @@ static void set_license(struct module *mod, const char *license)
if (!test_taint(TAINT_PROPRIETARY_MODULE))
printk(KERN_WARNING "%s: module license '%s' taints "
"kernel.\n", mod->name, license);
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
}
}
@@ -2539,7 +2547,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
if (err)
goto out;
- err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
+ err = vfs_getattr(&file->f_path, &stat);
if (err)
goto out;
@@ -2700,10 +2708,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
}
if (!get_modinfo(info, "intree"))
- add_taint_module(mod, TAINT_OOT_MODULE);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP);
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
printk(KERN_WARNING "%s: module is from the staging directory,"
" the quality is unknown, you have been warned.\n",
mod->name);
@@ -2869,15 +2877,17 @@ static int check_module_license_and_versions(struct module *mod)
* using GPL-only symbols it needs.
*/
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
/* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
/* lve claims to be GPL but upstream won't provide source */
if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
@@ -3141,12 +3151,72 @@ static int may_init_module(void)
return 0;
}
+/*
+ * We try to place it in the list now to make sure it's unique before
+ * we dedicate too many resources; in particular, this avoids exhausting
+ * temporary percpu memory.
+ */
+static int add_unformed_module(struct module *mod)
+{
+ int err;
+ struct module *old;
+
+ mod->state = MODULE_STATE_UNFORMED;
+
+again:
+ mutex_lock(&module_mutex);
+ if ((old = find_module_all(mod->name, true)) != NULL) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto out_unlocked;
+ goto again;
+ }
+ err = -EEXIST;
+ goto out;
+ }
+ list_add_rcu(&mod->list, &modules);
+ err = 0;
+
+out:
+ mutex_unlock(&module_mutex);
+out_unlocked:
+ return err;
+}
+
+static int complete_formation(struct module *mod, struct load_info *info)
+{
+ int err;
+
+ mutex_lock(&module_mutex);
+
+ /* Find duplicate symbols (must be called under lock). */
+ err = verify_export_symbols(mod);
+ if (err < 0)
+ goto out;
+
+ /* This relies on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+ /* Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us. */
+ mod->state = MODULE_STATE_COMING;
+
+out:
+ mutex_unlock(&module_mutex);
+ return err;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
- struct module *mod, *old;
+ struct module *mod;
long err;
err = module_sig_check(info);
@@ -3164,36 +3234,20 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_copy;
}
- /*
- * We try to place it in the list now to make sure it's unique
- * before we dedicate too many resources. In particular,
- * temporary percpu memory exhaustion.
- */
- mod->state = MODULE_STATE_UNFORMED;
-again:
- mutex_lock(&module_mutex);
- if ((old = find_module_all(mod->name, true)) != NULL) {
- if (old->state == MODULE_STATE_COMING
- || old->state == MODULE_STATE_UNFORMED) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto free_module;
- goto again;
- }
- err = -EEXIST;
- mutex_unlock(&module_mutex);
+ /* Reserve our place in the list. */
+ err = add_unformed_module(mod);
+ if (err)
goto free_module;
- }
- list_add_rcu(&mod->list, &modules);
- mutex_unlock(&module_mutex);
#ifdef CONFIG_MODULE_SIG
mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok)
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ if (!mod->sig_ok) {
+ printk_once(KERN_NOTICE
+ "%s: module verification failed: signature and/or"
+ " required key missing - tainting kernel\n",
+ mod->name);
+ add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
+ }
#endif
/* Now module is in final location, initialize linked lists, etc. */
@@ -3236,21 +3290,11 @@ again:
dynamic_debug_setup(info->debug, info->num_debug);
- mutex_lock(&module_mutex);
- /* Find duplicate symbols (must be called under lock). */
- err = verify_export_symbols(mod);
- if (err < 0)
+ /* Finally it's fully formed, ready to start executing. */
+ err = complete_formation(mod, info);
+ if (err)
goto ddebug_cleanup;
- /* This relies on module_mutex for list integrity. */
- module_bug_finalize(info->hdr, info->sechdrs, mod);
-
- /* Mark state as coming so strong_try_module_get() ignores us,
- * but kallsyms etc. can see us. */
- mod->state = MODULE_STATE_COMING;
-
- mutex_unlock(&module_mutex);
-
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, &ddebug_dyndbg_module_param_cb);
@@ -3274,8 +3318,8 @@ again:
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
- ddebug_cleanup:
mutex_unlock(&module_mutex);
+ ddebug_cleanup:
dynamic_debug_remove(info->debug);
synchronize_sched();
kfree(mod->args);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
*/
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
@@ -36,6 +37,12 @@
# include <asm/mutex.h>
#endif
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
+
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -43,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ lock->spin_mlock = NULL;
+#endif
debug_mutex_init(lock, name, key);
}
@@ -94,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * To avoid a stampede of mutex spinners all trying to acquire the mutex
+ * more or less simultaneously, each spinner first acquires an MCS lock
+ * before spinning on the owner field.
+ *
+ * We don't inline mspin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+struct mspin_node {
+ struct mspin_node *next;
+ int locked; /* 1 if lock acquired */
+};
+#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
+
+static noinline
+void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+ smp_wmb();
+}
+
+/*
+ * Mutex spinning code migrated from kernel/sched/core.c
+ */
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu dereference _after_ checking that
+ * lock->owner still matches owner. If that fails, owner might point
+ * to free()d memory; if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out of the loop above on need_resched() and when the
+ * owner changes, which is a sign of heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ int retval = 1;
+
+ rcu_read_lock();
+ if (lock->owner)
+ retval = lock->owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+#endif
+
static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/**
@@ -157,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
+ *
+ * The mutex spinners are queued up using an MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
*/
+ if (!mutex_can_spin_on_owner(lock))
+ goto slowpath;
for (;;) {
struct task_struct *owner;
+ struct mspin_node node;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
+ mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
+ if (owner && !mutex_spin_on_owner(lock, owner)) {
+ mspin_unlock(MLOCK(lock), &node);
break;
+ }
- if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+ if ((atomic_read(&lock->count) == 1) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
+ mspin_unlock(MLOCK(lock), &node);
preempt_enable();
return 0;
}
+ mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -194,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
arch_mutex_cpu_relax();
}
+slowpath:
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -204,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
goto done;
lock_contended(&lock->dep_map, ip);
@@ -219,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) &&
+ (atomic_xchg(&lock->count, -1) == 1))
break;
/*
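
To make the MCS handoff above easier to follow, here is a minimal user-space rendering of the same queue lock using C11 atomics in place of the kernel's xchg()/cmpxchg()/ACCESS_ONCE() primitives (an illustrative sketch with sequentially consistent atomics, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;
};

static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store(&node->next, NULL);
	atomic_store(&node->locked, false);

	prev = atomic_exchange(lock, node);	/* enqueue ourselves at the tail */
	if (!prev)
		return;				/* queue was empty: lock acquired */
	atomic_store(&prev->next, node);	/* link behind the old tail */
	while (!atomic_load(&node->locked))
		;				/* spin on our own flag only */
}

static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load(&node->next);

	if (!next) {
		struct mcs_node *expected = node;

		/* no successor visible: try to reset the tail to empty */
		if (atomic_compare_exchange_strong(lock, &expected, NULL))
			return;
		while (!(next = atomic_load(&node->next)))
			;			/* a successor is mid-enqueue */
	}
	atomic_store(&next->locked, true);	/* pass the lock down */
}

Each waiter spins on its own node rather than a shared word, which is what keeps the spinners from stampeding the mutex cache line.
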
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 78e2ecb20165..364ceab15f0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,7 +22,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
goto out;
}
- new_ns = create_new_namespaces(flags, tsk,
- task_cred_xxx(tsk, user_ns), tsk->fs);
+ new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns)) {
err = PTR_ERR(new_ns);
goto out;
@@ -242,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_inode *ei;
+ struct proc_ns *ei;
struct file *file;
int err;
@@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = PROC_I(file->f_dentry->d_inode);
+ ei = get_proc_ns(file_inode(file));
ops = ei->ns_ops;
if (nstype && (ops->type != nstype))
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..167ec097ce8b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -22,7 +22,6 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
-#include <linux/dmi.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -259,26 +258,19 @@ unsigned long get_taint(void)
return tainted_mask;
}
-void add_taint(unsigned flag)
+/**
+ * add_taint: add a taint flag if not already set.
+ * @flag: one of the TAINT_* constants.
+ * @lockdep_ok: whether lock debugging is still OK.
+ *
+ * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE,
+ * but for some noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK.
+ */
+void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
{
- /*
- * Can't trust the integrity of the kernel anymore.
- * We don't call directly debug_locks_off() because the issue
- * is not necessarily serious enough to set oops_in_progress to 1
- * Also we want to keep up lockdep for staging/out-of-tree
- * development and post-warning case.
- */
- switch (flag) {
- case TAINT_CRAP:
- case TAINT_OOT_MODULE:
- case TAINT_WARN:
- case TAINT_FIRMWARE_WORKAROUND:
- break;
-
- default:
- if (__debug_locks_off())
- printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
- }
+ if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
+ printk(KERN_WARNING
+ "Disabling lock debugging due to kernel taint\n");
set_bit(flag, &tainted_mask);
}
@@ -407,13 +399,8 @@ struct slowpath_args {
static void warn_slowpath_common(const char *file, int line, void *caller,
unsigned taint, struct slowpath_args *args)
{
- const char *board;
-
printk(KERN_WARNING "------------[ cut here ]------------\n");
printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (board)
- printk(KERN_WARNING "Hardware name: %s\n", board);
if (args)
vprintk(args->fmt, args->args);
@@ -421,7 +408,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
print_modules();
dump_stack();
print_oops_end_marker();
- add_taint(taint);
+ /* Just a warning, don't kill lockdep. */
+ add_taint(taint, LOCKDEP_STILL_OK);
}
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
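
A hypothetical caller of the reworked interface, for illustration: the new enum makes the lockdep decision explicit at every call site (the function below is made up; the pairings mirror ones used in the module.c hunks above):

#include <linux/kernel.h>

static void example_taint_sites(void)
{
	/* out-of-tree code loaded: noteworthy, locking still trustworthy */
	add_taint(TAINT_OOT_MODULE, LOCKDEP_STILL_OK);

	/* forced module unload: all bets are off, turn lockdep off */
	add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
}
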
diff --git a/kernel/params.c b/kernel/params.c
index ed35345be536..53b958fcd639 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
GFP_KERNEL);
if (!new) {
- kfree(mk->mp);
+ kfree(attrs);
err = -ENOMEM;
goto fail;
}
+ /* Despite looking like the typical realloc() bug, this is safe.
+ * We *want* the old 'attrs' to be freed either way, and we'll store
+ * the new one in the success case. */
attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
if (!attrs) {
err = -ENOMEM;
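
The comment added above disclaims the classic realloc() leak; the params.c code is exempt because the old 'attrs' must be freed on both paths. For contrast, this is the user-space shape of the bug it is *not* committing, next to the usual safe idiom (illustrative C only):

#include <stdlib.h>

/* The classic bug: on failure realloc() returns NULL but leaves the
 * original block allocated, so "p = realloc(p, n)" leaks it. */
static int grow_buggy(int **vec, size_t n)
{
	*vec = realloc(*vec, n * sizeof(**vec));	/* leaks on failure */
	return *vec ? 0 : -1;
}

/* The safe idiom: keep the old pointer until realloc() succeeds. */
static int grow_safe(int **vec, size_t n)
{
	int *tmp = realloc(*vec, n * sizeof(**vec));

	if (!tmp)
		return -1;	/* *vec is still valid and still owned */
	*vec = tmp;
	return 0;
}
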
diff --git a/kernel/pid.c b/kernel/pid.c
index f2c6a6825098..0db3e791a06d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
@@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#define BITS_PER_PAGE (PAGE_SIZE*8)
-#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-
static inline int mk_pid(struct pid_namespace *pid_ns,
struct pidmap *map, int off)
{
@@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
break;
}
if (likely(atomic_read(&map->nr_free))) {
- do {
+ for ( ; ; ) {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
set_last_pid(pid_ns, last, pid);
return pid;
}
offset = find_next_offset(map, offset);
+ if (offset >= BITS_PER_PAGE)
+ break;
pid = mk_pid(pid_ns, map, offset);
- } while (offset < BITS_PER_PAGE && pid < pid_max);
+ if (pid >= pid_max)
+ break;
+ }
}
if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
@@ -350,10 +352,9 @@ void disable_pid_allocation(struct pid_namespace *ns)
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
- struct hlist_node *elem;
struct upid *pnr;
- hlist_for_each_entry_rcu(pnr, elem,
+ hlist_for_each_entry_rcu(pnr,
&pid_hash[pid_hashfn(nr, ns)], pid_chain)
if (pnr->nr == nr && pnr->ns == ns)
return container_of(pnr, struct pid,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c6023..6917e8edb48e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,12 +15,10 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
-#define BITS_PER_PAGE (PAGE_SIZE*8)
-
struct pid_cache {
int nr_ids;
char name[16];
@@ -181,6 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int nr;
int rc;
struct task_struct *task, *me = current;
+ int init_pids = thread_group_leader(me) ? 1 : 2;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -230,7 +229,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (pid_ns->nr_hashed == 1)
+ if (pid_ns->nr_hashed == init_pids)
break;
schedule();
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d6..42670e9b44e0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/random.h>
+#include <linux/tick.h>
+#include <linux/workqueue.h>
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,13 +155,36 @@ static void bump_cpu_timer(struct k_itimer *timer,
}
}
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+ if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
static inline cputime_t prof_ticks(struct task_struct *p)
{
- return p->utime + p->stime;
+ cputime_t utime, stime;
+
+ task_cputime(p, &utime, &stime);
+
+ return utime + stime;
}
static inline cputime_t virt_ticks(struct task_struct *p)
{
- return p->utime;
+ cputime_t utime;
+
+ task_cputime(p, &utime, NULL);
+
+ return utime;
}
static int
@@ -471,18 +496,23 @@ static void cleanup_timers(struct list_head *head,
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
+ cputime_t utime, stime;
+
add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
sizeof(unsigned long long));
+ task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+ utime, stime, tsk->se.sum_exec_runtime);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
struct signal_struct *const sig = tsk->signal;
+ cputime_t utime, stime;
+ task_cputime(tsk, &utime, &stime);
cleanup_timers(tsk->signal->cpu_timers,
- tsk->utime + sig->utime, tsk->stime + sig->stime,
+ utime + sig->utime, stime + sig->stime,
tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
@@ -623,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+static void nohz_kick_work_fn(struct work_struct *work)
+{
+ tick_nohz_full_kick_all();
+}
+
+static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
+
+/*
+ * We need the IPIs to be sent from sane process context.
+ * The posix cpu timers are always set with irqs disabled.
+ */
+static void posix_cpu_timer_kick_nohz(void)
+{
+ schedule_work(&nohz_kick_work);
+}
+
+bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
+{
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return false;
+
+ if (tsk->signal->cputimer.running)
+ return false;
+
+ return true;
+}
+#else
+static inline void posix_cpu_timer_kick_nohz(void) { }
+#endif
+
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -781,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
sample_to_timespec(timer->it_clock,
old_incr, &old->it_interval);
}
+ if (!ret)
+ posix_cpu_timer_kick_nohz();
return ret;
}
@@ -995,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
}
}
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime: The struct to compare.
- *
- * Checks @cputime to see if all fields are zero. Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
-{
- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
- return 1;
- return 0;
-}
-
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1226,11 +1274,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
+ cputime_t utime, stime;
+
+ task_cputime(tsk, &utime, &stime);
if (!task_cputime_zero(&tsk->cputime_expires)) {
struct task_cputime task_sample = {
- .utime = tsk->utime,
- .stime = tsk->stime,
+ .utime = utime,
+ .stime = stime,
.sum_exec_runtime = tsk->se.sum_exec_runtime
};
@@ -1320,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
+
+ /*
+ * In case some timers were rescheduled after the queue got emptied,
+ * wake up full dynticks CPUs.
+ */
+ if (tsk->signal->cputimer.running)
+ posix_cpu_timer_kick_nohz();
}
/*
@@ -1350,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
if (!*newval)
- return;
+ goto out;
*newval += now.cpu;
}
@@ -1368,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
tsk->signal->cputime_expires.virt_exp = *newval;
break;
}
+out:
+ posix_cpu_timer_kick_nohz();
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
@@ -1401,8 +1461,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
while (!signal_pending(current)) {
if (timer.it.cpu.expires.sched == 0) {
/*
- * Our timer fired and was reset.
+ * Our timer fired and was reset; the
+ * deletion below cannot fail.
*/
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return 0;
}
@@ -1420,9 +1482,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ if (!error) {
+ /*
+ * The timer is now unarmed; deletion cannot fail.
+ */
+ posix_cpu_timer_del(&timer);
+ }
spin_unlock_irq(&timer.it_lock);
+ while (error == TIMER_RETRY) {
+ /*
+ * We need to handle the case when the timer was or is in the
+ * middle of firing. In all other cases we have already freed
+ * the resources.
+ */
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ }
+
if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
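
The conversions in this file all follow one pattern: stop reading tsk->utime/tsk->stime directly and go through task_cputime(), which can compute up-to-date values under full dynticks accounting. A hypothetical caller showing the pattern (illustrative only; it mirrors prof_ticks() above):

#include <linux/sched.h>

/* Hypothetical helper: snapshot a task's combined user+system time
 * through the accessor rather than the raw fields. */
static cputime_t example_total_cputime(struct task_struct *p)
{
	cputime_t utime, stime;

	task_cputime(p, &utime, &stime);
	return utime + stime;
}
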
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..424c2d4265c9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
-#include <linux/idr.h>
+#include <linux/hash.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/export.h>
+#include <linux/hashtable.h>
/*
- * Management arrays for POSIX timers. Timers are kept in slab memory
- * Timer ids are allocated by an external routine that keeps track of the
- * id and the timer. The external interface is:
- *
- * void *idr_find(struct idr *idp, int id); to find timer_id <id>
- * int idr_get_new(struct idr *idp, void *ptr); to get a new id and
- * related it to <ptr>
- * void idr_remove(struct idr *idp, int id); to release <id>
- * void idr_init(struct idr *idp); to initialize <idp>
- * which we supply.
- * The idr_get_new *may* call slab for more memory so it must not be
- * called under a spin lock. Likewise idr_remore may release memory
- * (but it may be ok to do this under a lock...).
- * idr_find is just a memory look up and is quite fast. A -1 return
- * indicates that the requested id does not exist.
+ * Management arrays for POSIX timers. Timers are now kept in a static
+ * hash table with 512 entries.
+ * Timer ids are allocated by a local routine that selects the proper hash
+ * head by a key constructed from the current->signal address and a per
+ * signal_struct counter.
+ * This keeps timer ids unique per process, but ids may now collide between
+ * processes.
*/
/*
* Lets keep our timers in a slab cache :-)
*/
static struct kmem_cache *posix_timers_cache;
-static struct idr posix_timers_id;
-static DEFINE_SPINLOCK(idr_lock);
+
+static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
+static DEFINE_SPINLOCK(hash_lock);
/*
* we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
__timr; \
})
+static int hash(struct signal_struct *sig, unsigned int nr)
+{
+ return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+}
+
+static struct k_itimer *__posix_timers_find(struct hlist_head *head,
+ struct signal_struct *sig,
+ timer_t id)
+{
+ struct k_itimer *timer;
+
+ hlist_for_each_entry_rcu(timer, head, t_hash) {
+ if ((timer->it_signal == sig) && (timer->it_id == id))
+ return timer;
+ }
+ return NULL;
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+
+ return __posix_timers_find(head, sig, id);
+}
+
+static int posix_timer_add(struct k_itimer *timer)
+{
+ struct signal_struct *sig = current->signal;
+ int first_free_id = sig->posix_timer_id;
+ struct hlist_head *head;
+ int ret = -ENOENT;
+
+ do {
+ spin_lock(&hash_lock);
+ head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
+ if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ hlist_add_head_rcu(&timer->t_hash, head);
+ ret = sig->posix_timer_id;
+ }
+ if (++sig->posix_timer_id < 0)
+ sig->posix_timer_id = 0;
+ if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
+ /* Loop over all possible ids completed */
+ ret = -EAGAIN;
+ spin_unlock(&hash_lock);
+ } while (ret == -ENOENT);
+ return ret;
+}
+
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
return 0;
}
+static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+{
+ timekeeping_clocktai(tp);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_monotonic_coarse,
};
+ struct k_clock clock_tai = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_tai,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
struct k_clock clock_boottime = {
.clock_getres = hrtimer_get_res,
.clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
+ posix_timers_register_clock(CLOCK_TAI, &clock_tai);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
NULL);
- idr_init(&posix_timers_id);
return 0;
}
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
{
if (it_id_set) {
unsigned long flags;
- spin_lock_irqsave(&idr_lock, flags);
- idr_remove(&posix_timers_id, tmr->it_id);
- spin_unlock_irqrestore(&idr_lock, flags);
+ spin_lock_irqsave(&hash_lock, flags);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock_irqrestore(&hash_lock, flags);
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
@@ -552,22 +610,9 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
- retry:
- if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) {
- error = -EAGAIN;
- goto out;
- }
- spin_lock_irq(&idr_lock);
- error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
- spin_unlock_irq(&idr_lock);
- if (error) {
- if (error == -EAGAIN)
- goto retry;
- /*
- * Weird looking, but we return EAGAIN if the IDR is
- * full (proper POSIX return value for this)
- */
- error = -EAGAIN;
+ new_timer_id = posix_timer_add(new_timer);
+ if (new_timer_id < 0) {
+ error = new_timer_id;
goto out;
}
@@ -639,8 +684,15 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
+ /*
+ * timer_t could be any type at least as wide as int, so make sure
+ * that any @timer_id outside the positive int range fails the lookup.
+ */
+ if ((unsigned long long)timer_id > INT_MAX)
+ return NULL;
+
rcu_read_lock();
- timr = idr_find(&posix_timers_id, (int)timer_id);
+ timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
@@ -997,7 +1049,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
err = kc->clock_adj(which_clock, &ktx);
- if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+ if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
return err;
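
A consequence of the hash scheme worth noting: timer ids remain unique only within a process, so identical timer_t values can now legitimately show up in different processes. A small user-space illustration (not part of the patch; link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t id;
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };

	/* Two unrelated processes running this may now print the same id. */
	if (timer_create(CLOCK_MONOTONIC, &sev, &id) == 0)
		printf("timer id %ld is unique per process only\n", (long)id);
	return 0;
}
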
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
void queue_up_suspend_work(void)
{
- if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
+ if (autosleep_state > PM_SUSPEND_ON)
queue_work(autosleep_wq, &suspend_work);
}
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b5..463aa6736751 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,6 +4,7 @@
* Originally from swsusp.
*/
+#include <linux/console.h>
#include <linux/vt_kern.h>
#include <linux/kbd_kern.h>
#include <linux/vt.h>
@@ -14,8 +15,120 @@
static int orig_fgconsole, orig_kmsg;
+static DEFINE_MUTEX(vt_switch_mutex);
+
+struct pm_vt_switch {
+ struct list_head head;
+ struct device *dev;
+ bool required;
+};
+
+static LIST_HEAD(pm_vt_switch_list);
+
+
+/**
+ * pm_vt_switch_required - indicate VT switch at suspend requirements
+ * @dev: device
+ * @required: if true, caller needs VT switch at suspend/resume time
+ *
+ * The different console drivers may or may not require VT switches across
+ * suspend/resume, depending on how they handle restoring video state and
+ * what may be running.
+ *
+ * Drivers can indicate support for switchless suspend/resume, which can
+ * save time and flicker, by using this routine and passing 'false' as
+ * the argument. If any loaded driver needs VT switching, or the
+ * no_console_suspend argument has been passed on the command line, VT
+ * switches will occur.
+ */
+void pm_vt_switch_required(struct device *dev, bool required)
+{
+ struct pm_vt_switch *entry, *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ /* already registered, update requirement */
+ tmp->required = required;
+ goto out;
+ }
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out;
+
+ entry->required = required;
+ entry->dev = dev;
+
+ list_add(&entry->head, &pm_vt_switch_list);
+out:
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_required);
+
+/**
+ * pm_vt_switch_unregister - stop tracking a device's VT switching needs
+ * @dev: device
+ *
+ * Remove @dev from the vt switch list.
+ */
+void pm_vt_switch_unregister(struct device *dev)
+{
+ struct pm_vt_switch *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ list_del(&tmp->head);
+ break;
+ }
+ }
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_unregister);
+
+/*
+ * There are three cases in which a VT switch on suspend/resume is required:
+ * 1) no driver has indicated a requirement one way or another, so preserve
+ * the old behavior
+ * 2) console suspend is disabled, we want to see debug messages across
+ * suspend/resume
+ * 3) any registered driver indicates it needs a VT switch
+ *
+ * If none of these conditions is present, meaning we have at least one driver
+ * that doesn't need the switch, and none that do, we can avoid it to make
+ * resume look a little prettier (and suspend too, but that's usually hidden,
+ * e.g. when closing the lid on a laptop).
+ */
+static bool pm_vt_switch(void)
+{
+ struct pm_vt_switch *entry;
+ bool ret = true;
+
+ mutex_lock(&vt_switch_mutex);
+ if (list_empty(&pm_vt_switch_list))
+ goto out;
+
+ if (!console_suspend_enabled)
+ goto out;
+
+ list_for_each_entry(entry, &pm_vt_switch_list, head) {
+ if (entry->required)
+ goto out;
+ }
+
+ ret = false;
+out:
+ mutex_unlock(&vt_switch_mutex);
+ return ret;
+}
+
int pm_prepare_console(void)
{
+ if (!pm_vt_switch())
+ return 0;
+
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
@@ -26,6 +139,9 @@ int pm_prepare_console(void)
void pm_restore_console(void)
{
+ if (!pm_vt_switch())
+ return;
+
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
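
A hypothetical consumer of the new hooks: a graphics driver that restores its own video state can opt out of the suspend-time VT switch at probe time and unregister on removal (the function names below are made up; the declarations are assumed to live in <linux/pm.h>):

#include <linux/device.h>
#include <linux/pm.h>

static int example_gfx_probe(struct device *dev)
{
	/* we repaint the screen on resume ourselves; no VT switch needed */
	pm_vt_switch_required(dev, false);
	return 0;
}

static void example_gfx_remove(struct device *dev)
{
	pm_vt_switch_unregister(dev);
}
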
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_STANDBY;
+ suspend_state_t state = PM_SUSPEND_MIN;
const char * const *s;
#endif
char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
#endif /* CONFIG_PM_TRACE */
+#ifdef CONFIG_FREEZER
+static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", freeze_timeout_msecs);
+}
+
+static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long val;
+
+ if (kstrtoul(buf, 10, &val))
+ return -EINVAL;
+
+ freeze_timeout_msecs = val;
+ return n;
+}
+
+power_attr(pm_freeze_timeout);
+
+#endif /* CONFIG_FREEZER*/
+
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
&pm_print_times_attr.attr,
#endif
#endif
+#ifdef CONFIG_FREEZER
+ &pm_freeze_timeout_attr.attr,
+#endif
NULL,
};
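
Assuming the attribute lands at /sys/power/pm_freeze_timeout (the name power_attr() derives above), a user-space sketch that raises the freezer timeout to 60 seconds:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/power/pm_freeze_timeout", O_WRONLY);

	if (fd < 0)
		return 1;
	/* the value is interpreted as milliseconds */
	write(fd, "60000", 5);
	close(fd);
	return 0;
}
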
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc9..7ef6866b521d 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,7 +32,7 @@ static void handle_poweroff(int key)
static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
- .help_msg = "powerOff",
+ .help_msg = "poweroff(o)",
.action_msg = "Power Off",
.enable_mask = SYSRQ_ENABLE_BOOT,
};
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
/*
* Timeout for stopping processes
*/
-#define TIMEOUT (20 * HZ)
+unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
do_gettimeofday(&start);
- end_time = jiffies + TIMEOUT;
+ end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
if (!user_only)
freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
return;
}
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
if (new_value != req->node.prio)
pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
"%s called for unknown object.", __func__))
return;
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
if (new_value != req->node.prio)
pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
return;
}
- if (delayed_work_pending(&req->work))
- cancel_delayed_work_sync(&req->work);
+ cancel_delayed_work_sync(&req->work);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_REMOVE_REQ,
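
The three deletions above share one rationale: cancel_delayed_work_sync() already handles work that is not queued, and the pending check it replaces was racy (the work could become queued between the check and the cancel). The resulting idiom, as a hypothetical caller:

#include <linux/workqueue.h>

/* Hypothetical teardown helper: no delayed_work_pending() check needed. */
static void example_teardown(struct delayed_work *dw)
{
	/* safe whether @dw is idle, queued, or currently running */
	cancel_delayed_work_sync(dw);
}
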
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..bef86d121eb2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
#include "power.h"
const char *const pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_FREEZE] = "freeze",
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
static const struct platform_suspend_ops *suspend_ops;
+static bool need_suspend_ops(suspend_state_t state)
+{
+ return !!(state > PM_SUSPEND_FREEZE);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static bool suspend_freeze_wake;
+
+static void freeze_begin(void)
+{
+ suspend_freeze_wake = false;
+}
+
+static void freeze_enter(void)
+{
+ wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+}
+
+void freeze_wake(void)
+{
+ suspend_freeze_wake = true;
+ wake_up(&suspend_freeze_wait_head);
+}
+EXPORT_SYMBOL_GPL(freeze_wake);
+
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Suspend operations to use.
@@ -50,8 +76,23 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
bool valid_state(suspend_state_t state)
{
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE &&
+ pm_test_level != TEST_FREEZER &&
+ pm_test_level != TEST_DEVICES &&
+ pm_test_level != TEST_PLATFORM) {
+ printk(KERN_WARNING "Unsupported pm_test mode for "
+ "freeze state, please choose "
+ "none/freezer/devices/platform.\n");
+ return false;
+ }
+#endif
+ return true;
+ }
/*
- * All states need lowlevel support and need to be valid to the lowlevel
+ * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need lowlevel
+ * support and need to be valid to the lowlevel
* implementation, no valid callback implies that none are valid.
*/
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +130,11 @@ static int suspend_test(int level)
* hibernation). Run suspend notifiers, allocate the "suspend" console and
* freeze processes.
*/
-static int suspend_prepare(void)
+static int suspend_prepare(suspend_state_t state)
{
int error;
- if (!suspend_ops || !suspend_ops->enter)
+ if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
return -EPERM;
pm_prepare_console();
@@ -137,7 +178,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (suspend_ops->prepare) {
+ if (need_suspend_ops(state) && suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
goto Platform_finish;
@@ -149,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_finish;
}
- if (suspend_ops->prepare_late) {
+ if (need_suspend_ops(state) && suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
goto Platform_wake;
@@ -158,6 +199,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
+ /*
+ * PM_SUSPEND_FREEZE equals
+ * frozen processes + suspended devices + idle processors.
+ * Thus we should invoke freeze_enter() soon after
+ * all the devices are suspended.
+ */
+ if (state == PM_SUSPEND_FREEZE) {
+ freeze_enter();
+ goto Platform_wake;
+ }
+
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -182,13 +234,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
enable_nonboot_cpus();
Platform_wake:
- if (suspend_ops->wake)
+ if (need_suspend_ops(state) && suspend_ops->wake)
suspend_ops->wake();
dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (suspend_ops->finish)
+ if (need_suspend_ops(state) && suspend_ops->finish)
suspend_ops->finish();
return error;
@@ -203,11 +255,11 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (!suspend_ops)
+ if (need_suspend_ops(state) && !suspend_ops)
return -ENOSYS;
trace_machine_suspend(state);
- if (suspend_ops->begin) {
+ if (need_suspend_ops(state) && suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
goto Close;
@@ -226,7 +278,7 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup
+ } while (!error && !wakeup && need_suspend_ops(state)
&& suspend_ops->suspend_again && suspend_ops->suspend_again());
Resume_devices:
@@ -236,13 +288,13 @@ int suspend_devices_and_enter(suspend_state_t state)
ftrace_start();
resume_console();
Close:
- if (suspend_ops->end)
+ if (need_suspend_ops(state) && suspend_ops->end)
suspend_ops->end();
trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
- if (suspend_ops->recover)
+ if (need_suspend_ops(state) && suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}
@@ -278,12 +330,15 @@ static int enter_state(suspend_state_t state)
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
+ if (state == PM_SUSPEND_FREEZE)
+ freeze_begin();
+
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
- error = suspend_prepare();
+ error = suspend_prepare(state);
if (error)
goto Unlock;
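
The new freeze state blocks in freeze_enter() until something calls freeze_wake(). A hypothetical wakeup-capable interrupt handler showing how the state is exited (illustrative, not from the patch; freeze_wake() is assumed declared in <linux/suspend.h>):

#include <linux/interrupt.h>
#include <linux/suspend.h>

static irqreturn_t example_wake_irq(int irq, void *dev_id)
{
	/* any wakeup event ends the freeze: let freeze_enter() return */
	freeze_wake();
	return IRQ_HANDLED;
}
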
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
rtc_set_alarm(rtc, &alm);
}
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
+static int __init has_wakealarm(struct device *dev, const void *data)
{
struct rtc_device *candidate = to_rtc_device(dev);
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
if (!device_may_wakeup(candidate->dev.parent))
return 0;
- *(const char **)name_ptr = dev_name(dev);
return 1;
}
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
static char warn_no_rtc[] __initdata =
KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
- char *pony = NULL;
struct rtc_device *rtc = NULL;
+ struct device *dev;
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
}
/* RTCs have initialized by now too ... can we use one? */
- class_find_device(rtc_class, NULL, &pony, has_wakealarm);
- if (pony)
- rtc = rtc_class_open(pony);
+ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
+ if (dev)
+ rtc = rtc_class_open(dev_name(dev));
if (!rtc) {
printk(warn_no_rtc);
goto done;
diff --git a/kernel/printk.c b/kernel/printk.c
index 267ce780abe8..fa36e1494420 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
+#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -42,19 +43,14 @@
#include <linux/notifier.h>
#include <linux/rculist.h>
#include <linux/poll.h>
+#include <linux/irq_work.h>
+#include <linux/utsname.h>
#include <asm/uaccess.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
-/*
- * Architectures can override it:
- */
-void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
-{
-}
-
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
@@ -62,8 +58,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-
int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -87,6 +81,12 @@ static DEFINE_SEMAPHORE(console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map console_lock_dep_map = {
+ .name = "console_lock"
+};
+#endif
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -217,6 +217,7 @@ struct log {
static DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
@@ -602,7 +603,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
- ret = POLLIN|POLLRDNORM;
+ else
+ ret = POLLIN|POLLRDNORM;
}
raw_spin_unlock_irq(&logbuf_lock);
@@ -1259,7 +1261,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
{
struct console *con;
- trace_console(text, 0, len, len);
+ trace_console(text, len);
if (level >= console_loglevel && !ignore_loglevel)
return;
@@ -1717,6 +1719,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
#endif /* CONFIG_PRINTK */
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+void early_vprintk(const char *fmt, va_list ap)
+{
+ if (early_console) {
+ char buf[512];
+ int n = vscnprintf(buf, sizeof(buf), fmt, ap);
+
+ early_console->write(early_console, buf, n);
+ }
+}
+
+asmlinkage void early_printk(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ early_vprintk(fmt, ap);
+ va_end(ap);
+}
+#endif
+
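Background: the weak early_printk() stub is replaced by a generic implementation that formats into a 512-byte stack buffer and writes through a shared early_console pointer, so an architecture only registers a struct console instead of reimplementing the formatting loop. A sketch of what an arch might do at boot (the console name and write callback are hypothetical):

    /* Sketch (assumption): arch boot code publishing its early console. */
    static void sketch_early_write(struct console *con, const char *s,
                                   unsigned int n)
    {
            while (n--)
                    sketch_uart_putchar(*s++);  /* hypothetical polled UART */
    }

    static struct console sketch_early_con = {
            .name   = "earlycon",
            .write  = sketch_early_write,
            .flags  = CON_PRINTBUFFER | CON_BOOT,
            .index  = -1,
    };

    void __init sketch_setup_early_printk(void)
    {
            early_console = &sketch_early_con;  /* early_printk() has a sink */
    }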
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options)
{
@@ -1918,6 +1943,7 @@ void console_lock(void)
return;
console_locked = 1;
console_may_schedule = 1;
+ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
}
EXPORT_SYMBOL(console_lock);
@@ -1939,6 +1965,7 @@ int console_trylock(void)
}
console_locked = 1;
console_may_schedule = 0;
+ mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -1948,43 +1975,6 @@ int is_console_locked(void)
return console_locked;
}
-/*
- * Delayed printk version, for scheduler-internal messages:
- */
-#define PRINTK_BUF_SIZE 512
-
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_SCHED 0x02
-
-static DEFINE_PER_CPU(int, printk_pending);
-static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-
-void printk_tick(void)
-{
- if (__this_cpu_read(printk_pending)) {
- int pending = __this_cpu_xchg(printk_pending, 0);
- if (pending & PRINTK_PENDING_SCHED) {
- char *buf = __get_cpu_var(printk_sched_buf);
- printk(KERN_WARNING "[sched_delayed] %s", buf);
- }
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
- }
-}
-
-int printk_needs_cpu(int cpu)
-{
- if (cpu_is_offline(cpu))
- printk_tick();
- return __this_cpu_read(printk_pending);
-}
-
-void wake_up_klogd(void)
-{
- if (waitqueue_active(&log_wait))
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
-}
-
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2099,6 +2089,7 @@ skip:
local_irq_restore(flags);
}
console_locked = 0;
+ mutex_release(&console_lock_dep_map, 1, _RET_IP_);
/* Release the exclusive_console once it is used */
if (unlikely(exclusive_console))
@@ -2446,6 +2437,44 @@ static int __init printk_late_init(void)
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_BUF_SIZE 512
+
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_SCHED 0x02
+
+static DEFINE_PER_CPU(int, printk_pending);
+static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
+{
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_SCHED) {
+ char *buf = __get_cpu_var(printk_sched_buf);
+ printk(KERN_WARNING "[sched_delayed] %s", buf);
+ }
+
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
+}
+
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
+
+void wake_up_klogd(void)
+{
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
+ this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ }
+ preempt_enable();
+}
int printk_sched(const char *fmt, ...)
{
@@ -2462,6 +2491,7 @@ int printk_sched(const char *fmt, ...)
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
local_irq_restore(flags);
return r;
@@ -2821,4 +2851,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+
+static char dump_stack_arch_desc_str[128];
+
+/**
+ * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * The configured string will be printed right after utsname during task
+ * dumps. Usually used to add arch-specific system identifiers. If an
+ * arch wants to make use of such an ID string, it should initialize this
+ * as soon as possible during boot.
+ */
+void __init dump_stack_set_arch_desc(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
+ fmt, args);
+ va_end(args);
+}
+
+/**
+ * dump_stack_print_info - print generic debug info for dump_stack()
+ * @log_lvl: log level
+ *
+ * Arch-specific dump_stack() implementations can use this function to
+ * print out the same debug information as the generic dump_stack().
+ */
+void dump_stack_print_info(const char *log_lvl)
+{
+ printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+ log_lvl, raw_smp_processor_id(), current->pid, current->comm,
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ if (dump_stack_arch_desc_str[0] != '\0')
+ printk("%sHardware name: %s\n",
+ log_lvl, dump_stack_arch_desc_str);
+
+ print_worker_info(log_lvl, current);
+}
+
+/**
+ * show_regs_print_info - print generic debug info for show_regs()
+ * @log_lvl: log level
+ *
+ * show_regs() implementations can use this function to print out generic
+ * debug information.
+ */
+void show_regs_print_info(const char *log_lvl)
+{
+ dump_stack_print_info(log_lvl);
+
+ printk("%stask: %p ti: %p task.ti: %p\n",
+ log_lvl, current, current_thread_info(),
+ task_thread_info(current));
+}
+
#endif
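Background: the old printk_tick()/printk_needs_cpu() polling (removed above and re-added in irq_work form) forced the timer tick to stay alive just to flush deferred wakeups; IRQ_WORK_LAZY queues the work without raising an IPI and lets the next naturally occurring tick run it. The pattern in miniature, assuming the era's per-CPU accessors:

    /* Sketch (assumption): lazy irq_work rides the next timer tick. */
    static void my_deferred(struct irq_work *work)
    {
            /* runs later in IRQ context, off the hot path */
    }

    static DEFINE_PER_CPU(struct irq_work, my_work) = {
            .func   = my_deferred,
            .flags  = IRQ_WORK_LAZY,        /* no IPI; wait for a tick */
    };

    void poke(void)
    {
            preempt_disable();              /* pin CPU for __get_cpu_var() */
            irq_work_queue(&__get_cpu_var(my_work));
            preempt_enable();
    }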
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..0bf400737660 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
-/* Oprofile timer tick hook */
-static int (*timer_hook)(struct pt_regs *) __read_mostly;
-
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-int register_timer_hook(int (*hook)(struct pt_regs *))
-{
- if (timer_hook)
- return -EBUSY;
- timer_hook = hook;
- return 0;
-}
-EXPORT_SYMBOL_GPL(register_timer_hook);
-
-void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
- WARN_ON(hook != timer_hook);
- timer_hook = NULL;
- /* make sure all CPUs see the NULL hook */
- synchronize_sched(); /* Allow ongoing interrupts to complete. */
-}
-EXPORT_SYMBOL_GPL(unregister_timer_hook);
-
-
#ifdef CONFIG_SMP
/*
* Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
{
struct pt_regs *regs = get_irq_regs();
- if (type == CPU_PROFILING && timer_hook)
- timer_hook(regs);
if (!user_mode(regs) && prof_cpu_mask != NULL &&
cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
profile_hit(type, (void *)profile_pc(regs));
@@ -486,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = {
.write = prof_cpu_mask_proc_write,
};
-void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
+void create_prof_cpu_mask(void)
{
/* create /proc/irq/prof_cpu_mask */
- proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
+ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
}
/*
@@ -624,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
NULL, &proc_profile_operations);
if (!entry)
return 0;
- entry->size = (1+prof_len) * sizeof(atomic_t);
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
hotcpu_notifier(profile_cpu_callback, 0);
return 0;
}
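Background: register_timer_hook()/unregister_timer_hook() lost their only user when oprofile switched to the perf infrastructure, so the hook and its indirect call in profile_tick() can go; the direct write to entry->size becomes proc_set_size() because struct proc_dir_entry is being made opaque outside fs/proc. The accessor idiom, as a sketch:

    /* Sketch: sizing a /proc entry through the opaque-struct accessor. */
    struct proc_dir_entry *entry;

    entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL,
                        &proc_profile_operations);
    if (entry)
            proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));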
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6cbeaae4406d..aed981a3f69c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
+#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
@@ -24,6 +25,7 @@
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
+#include <linux/compat.h>
static int ptrace_trapping_sleep_fn(void *flags)
@@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
return error;
}
+static int ptrace_peek_siginfo(struct task_struct *child,
+ unsigned long addr,
+ unsigned long data)
+{
+ struct ptrace_peeksiginfo_args arg;
+ struct sigpending *pending;
+ struct sigqueue *q;
+ int ret, i;
+
+ ret = copy_from_user(&arg, (void __user *) addr,
+ sizeof(struct ptrace_peeksiginfo_args));
+ if (ret)
+ return -EFAULT;
+
+ if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
+ return -EINVAL; /* unknown flags */
+
+ if (arg.nr < 0)
+ return -EINVAL;
+
+ if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
+ pending = &child->signal->shared_pending;
+ else
+ pending = &child->pending;
+
+ for (i = 0; i < arg.nr; ) {
+ siginfo_t info;
+ s32 off = arg.off + i;
+
+ spin_lock_irq(&child->sighand->siglock);
+ list_for_each_entry(q, &pending->list, list) {
+ if (!off--) {
+ copy_siginfo(&info, &q->info);
+ break;
+ }
+ }
+ spin_unlock_irq(&child->sighand->siglock);
+
+ if (off >= 0) /* beyond the end of the list */
+ break;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(is_compat_task())) {
+ compat_siginfo_t __user *uinfo = compat_ptr(data);
+
+ ret = copy_siginfo_to_user32(uinfo, &info);
+ ret |= __put_user(info.si_code, &uinfo->si_code);
+ } else
+#endif
+ {
+ siginfo_t __user *uinfo = (siginfo_t __user *) data;
+
+ ret = copy_siginfo_to_user(uinfo, &info);
+ ret |= __put_user(info.si_code, &uinfo->si_code);
+ }
+
+ if (ret) {
+ ret = -EFAULT;
+ break;
+ }
+
+ data += sizeof(siginfo_t);
+ i++;
+
+ if (signal_pending(current))
+ break;
+
+ cond_resched();
+ }
+
+ if (i > 0)
+ return i;
+
+ return ret;
+}
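Background: PTRACE_PEEKSIGINFO gives a tracer a non-destructive window into the tracee's pending-signal queue: arg.off/arg.nr select a range, PTRACE_PEEKSIGINFO_SHARED switches from the per-thread to the process-wide queue, and the return value is the number of entries copied. A hedged user-space sketch (pid is assumed to be an attached tracee; error handling elided):

    /* Sketch (assumption): peek the first four pending siginfos. */
    struct ptrace_peeksiginfo_args arg = {
            .off    = 0,    /* start of the queue */
            .flags  = 0,    /* per-thread queue; or PTRACE_PEEKSIGINFO_SHARED */
            .nr     = 4,    /* at most four entries */
    };
    siginfo_t info[4];

    long n = ptrace(PTRACE_PEEKSIGINFO, pid, &arg, info);
    /* n > 0: entries copied; n == 0: queue exhausted; n < 0: error */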
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -712,6 +789,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
kiov->iov_len, kiov->iov_base);
}
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code. We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
#endif
int ptrace_request(struct task_struct *child, long request,
@@ -742,6 +825,10 @@ int ptrace_request(struct task_struct *child, long request,
ret = put_user(child->ptrace_message, datalp);
break;
+ case PTRACE_PEEKSIGINFO:
+ ret = ptrace_peek_siginfo(child, addr, data);
+ break;
+
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, &siginfo);
if (!ret)
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed68..071b0ab455cb 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
range[i].end = range[j].end;
range[i].start = end;
} else {
- printk(KERN_ERR "run of slot in ranges\n");
+ pr_err("%s: run out of slots in ranges\n",
+ __func__);
}
range[j].end = start;
continue;
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba576c2b..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
extern int rcu_expedited;
+#ifdef CONFIG_RCU_STALL_COMMON
+
+extern int rcu_cpu_stall_suppress;
+int rcu_jiffies_till_stall_check(void);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b44..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
-void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old, unsigned long c)
{
- trace_rcu_torture_read(rcutorturename, rhp);
+ trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
#endif
+
+#ifdef CONFIG_RCU_STALL_COMMON
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+
+module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
+
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
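Background: the stall-warning core moves behind CONFIG_RCU_STALL_COMMON so tiny RCU can share it; rcu_jiffies_till_stall_check() clamps the runtime-tunable timeout to the same 3..300-second window the Kconfig option enforces. A worked example of the clamping (the sysfs path and values are assumptions):

    /* Worked example (assumes HZ == 1000 and !CONFIG_PROVE_RCU):
     *   echo 600 > /sys/module/rcupdate/parameters/rcu_cpu_stall_timeout
     * is pulled back to 300, so rcu_jiffies_till_stall_check() returns
     *   300 * HZ + 0 == 300000 jiffies (five minutes).
     */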
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2a..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
-#include "rcutiny_plugin.h"
-
static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+#include "rcutiny_plugin.h"
+
/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
static void rcu_idle_enter_common(long long newval)
{
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
* interrupts don't count, we must be running at the first interrupt
* level.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
return rcu_dynticks_nesting <= 1;
}
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
+ reset_cpu_stall_ticks(rcp);
if (rcp->rcucblist != NULL &&
rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
*/
void rcu_check_callbacks(int cpu, int user)
{
+ check_cpu_stalls();
if (user || rcu_is_cpu_rrupt_from_idle())
rcu_sched_qs(cpu);
else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309b..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
struct rcu_head **curtail; /* ->next pointer of last CB. */
RCU_TRACE(long qlen); /* Number of pending CBs. */
+ RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
+ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
+ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
RCU_TRACE(char *name); /* Name of RCU type. */
};
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#ifdef CONFIG_RCU_TRACE
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long j;
+ unsigned long js;
+
+ if (rcu_cpu_stall_suppress)
+ return;
+ rcp->ticks_this_gp++;
+ j = jiffies;
+ js = rcp->jiffies_stall;
+ if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+ pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
+ rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+ jiffies - rcp->gp_start, rcp->qlen);
+ dump_stack();
+ }
+ if (*rcp->curtail && ULONG_CMP_GE(j, js))
+ rcp->jiffies_stall = jiffies +
+ 3 * rcu_jiffies_till_stall_check() + 3;
+ else if (ULONG_CMP_GE(j, js))
+ rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+}
+
+static void check_cpu_stall_preempt(void);
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
+{
+#ifdef CONFIG_RCU_TRACE
+ rcp->ticks_this_gp = 0;
+ rcp->gp_start = jiffies;
+ rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+#endif /* #ifdef CONFIG_RCU_TRACE */
+}
+
+static void check_cpu_stalls(void)
+{
+ RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
+ RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+ RCU_TRACE(check_cpu_stall_preempt());
+}
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
/* Official start of GP. */
rcu_preempt_ctrlblk.gpnum++;
RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
+ reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
/* Any blocked RCU readers block new GP. */
if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
MODULE_LICENSE("GPL");
+static void check_cpu_stall_preempt(void)
+{
+#ifdef CONFIG_TINY_PREEMPT_RCU
+ check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
+#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+}
+
#endif /* #ifdef CONFIG_RCU_TRACE */
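Background: tiny RCU now emits the same stall warnings, but only in CONFIG_RCU_TRACE builds -- every new field and check sits inside RCU_TRACE() so the !TRACE configuration keeps its zero-footprint guarantee. The macro this leans on (per kernel/rcu.h):

    /* RCU_TRACE() drops its argument entirely when tracing is off. */
    #ifdef CONFIG_RCU_TRACE
    #define RCU_TRACE(stmt) stmt
    #else
    #define RCU_TRACE(stmt)
    #endif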
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85fd..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
#include <linux/stat.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/trace_clock.h>
#include <asm/byteorder.h>
MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
#define rcu_can_boost() 0
#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+ u64 ts = trace_clock_local();
+ unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+ return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+ return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
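Background: rcu_trace_clock_local() converts the nanosecond-resolution trace_clock_local() value to microseconds for the rcu_torture_read tracepoint; do_div() divides the u64 in place and returns the remainder, which is deliberately discarded. In miniature:

    /* Sketch: do_div(x, base) performs x /= base and returns x % base. */
    u64 ts = 1234567891ULL;                         /* nanoseconds */
    unsigned long rem = do_div(ts, NSEC_PER_USEC);  /* NSEC_PER_USEC == 1000 */
    /* now ts == 1234567 (us) and rem == 891 (ns) */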
static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)
/* Wait for the next test interval. */
oldstarttime = boost_starttime;
while (ULONG_CMP_LT(jiffies, oldstarttime)) {
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_interruptible(oldstarttime - jiffies);
rcu_stutter_wait("rcu_torture_boost");
if (kthread_should_stop() ||
fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)
return;
if (atomic_xchg(&beenhere, 1) != 0)
return;
- do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
ftrace_dump(DUMP_ALL);
}
@@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
{
int idx;
int completed;
+ int completed_end;
static DEFINE_RCU_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
int pipe_count;
+ unsigned long long ts;
idx = cur_ops->readlock();
completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
cur_ops->readunlock(idx);
return;
}
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- if (pipe_count > 1)
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+ completed, completed_end);
rcutorture_trace_dump();
+ }
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
+ completed = completed_end - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@ static int
rcu_torture_reader(void *arg)
{
int completed;
+ int completed_end;
int idx;
DEFINE_RCU_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
struct timer_list t;
+ unsigned long long ts;
VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)
}
idx = cur_ops->readlock();
completed = cur_ops->completed();
+ ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)
schedule_timeout_interruptible(HZ);
continue;
}
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
- if (pipe_count > 1)
+ completed_end = cur_ops->completed();
+ if (pipe_count > 1) {
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+ ts, completed, completed_end);
rcutorture_trace_dump();
+ }
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = cur_ops->completed() - completed;
+ completed = completed_end - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
set_cpus_allowed_ptr(reader_tasks[i],
shuffle_tmp_mask);
}
-
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed_ptr(fakewriter_tasks[i],
shuffle_tmp_mask);
}
-
if (writer_task)
set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
-
if (stats_task)
set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
+ if (stutter_task)
+ set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
+ if (fqs_task)
+ set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
+ if (shutdown_task)
+ set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task)
+ set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ if (stall_task)
+ set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
+ if (barrier_cbs_tasks)
+ for (i = 0; i < n_barrier_cbs; i++)
+ if (barrier_cbs_tasks[i])
+ set_cpus_allowed_ptr(barrier_cbs_tasks[i],
+ shuffle_tmp_mask);
+ if (barrier_task)
+ set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
if (rcu_idle_cpu == -1)
rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
barrier_cbs_wq =
kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
GFP_KERNEL);
- if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+ if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
init_waitqueue_head(&barrier_cbs_wq[i]);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..16ea67925015 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-#define RCU_STATE_INITIALIZER(sname, cr) { \
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
.level = { &sname##_state.node[0] }, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
+ .abbr = sabbr, \
}
struct rcu_state rcu_sched_state =
- RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
+ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
@@ -105,7 +106,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
* The rcu_scheduler_active variable transitions from zero to one just
* before the first task is spawned. So when this variable is zero, RCU
* can assume that there is but one task, allowing RCU to (for example)
- * optimized synchronize_sched() to a simple barrier(). When this variable
+ * optimize synchronize_sched() to a simple barrier(). When this variable
* is one, RCU must actually do all the hard work required to detect real
* grace periods. This variable is also used to suppress boot-time false
* positives from lockdep-RCU error checking.
@@ -217,18 +218,14 @@ module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
-int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-
-module_param(rcu_cpu_stall_suppress, int, 0644);
-module_param(rcu_cpu_stall_timeout, int, 0644);
-
static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
@@ -305,17 +302,29 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
}
/*
- * Does the current CPU require a yet-as-unscheduled grace period?
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
*/
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- struct rcu_head **ntp;
+ int i;
- ntp = rdp->nxttail[RCU_DONE_TAIL +
- (ACCESS_ONCE(rsp->completed) != rdp->completed)];
- return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
- !rcu_gp_in_progress(rsp);
+ if (rcu_gp_in_progress(rsp))
+ return 0; /* No, a grace period is already in progress. */
+ if (rcu_nocb_needs_gp(rsp))
+ return 1; /* Yes, a no-CBs CPU needs one. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL])
+ return 0; /* No, this is a no-CBs (or offline) CPU. */
+ if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+ return 1; /* Yes, this CPU has newly registered callbacks. */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+ ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+ rdp->nxtcompleted[i]))
+ return 1; /* Yes, CBs for future grace period. */
+ return 0; /* No grace period needed. */
}
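Background: the rewritten cpu_needs_another_gp() walks the segmented callback list explicitly: no new grace period while one is running, no-CBs CPUs answer through rcu_nocb_needs_gp(), newly registered callbacks always want one, and older sublists want one only if their recorded ->completed tag is still in the future. The layout and wrap-safe comparison it relies on, as a sketch:

    /* Sketch: one CPU's segmented callback list and its tail pointers.
     *
     *   nxtlist -> [DONE] -> [WAIT] -> [NEXT_READY] -> [NEXT] -> NULL
     *
     * nxttail[RCU_DONE_TAIL] .. nxttail[RCU_NEXT_TAIL] point at the
     * ->next field ending each segment; nxtcompleted[i] holds the
     * ->completed value that lets segment i advance.
     *
     * Wrap-safe comparisons (per include/linux/rcupdate.h):
     */
    #define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
    #define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))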
/*
@@ -336,7 +345,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
bool user)
{
- trace_rcu_dyntick("Start", oldval, 0);
+ trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
@@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
{
return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
}
@@ -790,31 +799,23 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->offline_fqs++;
return 1;
}
- return 0;
-}
-
-static int jiffies_till_stall_check(void)
-{
- int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
/*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ * There is a possibility that a CPU in adaptive-ticks state
+ * might run in the kernel with the scheduling-clock tick disabled
+ * for an extended time period. Invoke rcu_kick_nohz_cpu() to
+ * force the CPU to restart the scheduling-clock tick if this
+ * CPU is in this state.
*/
- if (till_stall_check < 3) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+ rcu_kick_nohz_cpu(rdp->cpu);
+
+ return 0;
}
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
rsp->gp_start = jiffies;
- rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
+ rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
}
/*
@@ -857,7 +858,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
+ rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -935,7 +936,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
rsp->jiffies_stall = jiffies +
- 3 * jiffies_till_stall_check() + 3;
+ 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
set_need_resched(); /* kick ourselves to get things going. */
@@ -966,12 +967,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
}
}
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
/**
* rcu_cpu_stall_reset - prevent further stall warnings in current grace period
*
@@ -989,15 +984,6 @@ void rcu_cpu_stall_reset(void)
rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
}
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static void __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
-}
-
/*
* Update CPU-local rcu_data state to record the newly noticed grace period.
* This is used both when we started the grace period and when we notice
@@ -1064,10 +1050,266 @@ static void init_callback_list(struct rcu_data *rdp)
{
int i;
+ if (init_nocb_callback_list(rdp))
+ return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- init_nocb_callback_list(rdp);
+}
+
+/*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period. This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ /*
+ * If RCU is idle, we just wait for the next grace period.
+ * But we can only be sure that RCU is idle if we are looking
+ * at the root rcu_node structure -- otherwise, a new grace
+ * period might have started, but just not yet gotten around
+ * to initializing the current non-root rcu_node structure.
+ */
+ if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
+ return rnp->completed + 1;
+
+ /*
+ * Otherwise, wait for a possible partial grace period and
+ * then the subsequent full grace period.
+ */
+ return rnp->completed + 2;
+}
+
+/*
+ * Trace-event helper function for rcu_start_future_gp() and
+ * rcu_nocb_wait_gp().
+ */
+static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, char *s)
+{
+ trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+ rnp->completed, c, rnp->level,
+ rnp->grplo, rnp->grphi, s);
+}
+
+/*
+ * Start some future grace period, as needed to handle newly arrived
+ * callbacks. The required future grace periods are recorded in each
+ * rcu_node structure's ->need_future_gp field.
+ *
+ * The caller must hold the specified rcu_node structure's ->lock.
+ */
+static unsigned long __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ /*
+ * Pick up grace-period number for new callbacks. If this
+ * grace period is already marked as needed, return to the caller.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
+ if (rnp->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
+ return c;
+ }
+
+ /*
+ * If either this rcu_node structure or the root rcu_node structure
+ * believe that a grace period is in progress, then we must wait
+ * for the one following, which is in "c". Because our request
+ * will be noticed at the end of the current grace period, we don't
+ * need to explicitly start one.
+ */
+ if (rnp->gpnum != rnp->completed ||
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
+ rnp->need_future_gp[c & 0x1]++;
+ trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
+ return c;
+ }
+
+ /*
+ * There might be no grace period in progress. If we don't already
+ * hold it, acquire the root rcu_node structure's lock in order to
+ * start one (if needed).
+ */
+ if (rnp != rnp_root)
+ raw_spin_lock(&rnp_root->lock);
+
+ /*
+ * Get a new grace-period number. If there really is no grace
+ * period in progress, it will be smaller than the one we obtained
+ * earlier. Adjust callbacks as needed. Note that even no-CBs
+ * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp_root);
+ for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+ rdp->nxtcompleted[i] = c;
+
+ /*
+ * If the need for the required grace period is already
+ * recorded, trace and leave.
+ */
+ if (rnp_root->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
+ goto unlock_out;
+ }
+
+ /* Record the need for the future grace period. */
+ rnp_root->need_future_gp[c & 0x1]++;
+
+ /* If a grace period is not already in progress, start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
+ } else {
+ trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
+ rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ }
+unlock_out:
+ if (rnp != rnp_root)
+ raw_spin_unlock(&rnp_root->lock);
+ return c;
+}
+
+/*
+ * Clean up any old requests for the just-ended grace period. Also return
+ * whether any additional grace periods have been requested. Also invoke
+ * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
+ * waiting for this grace period to complete.
+ */
+static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ int c = rnp->completed;
+ int needmore;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+
+ rcu_nocb_gp_cleanup(rsp, rnp);
+ rnp->need_future_gp[c & 0x1] = 0;
+ needmore = rnp->need_future_gp[(c + 1) & 0x1];
+ trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
+ return needmore;
+}
+
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned. Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure. This function is idempotent, so it does
+ * not hurt to call it repeatedly.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return;
+
+ /*
+ * Starting from the sublist containing the callbacks most
+ * recently assigned a ->completed number and working down, find the
+ * first sublist that is not assignable to an upcoming grace period.
+ * Such a sublist has something in it (first two tests) and has
+ * a ->completed number assigned that will complete sooner than
+ * the ->completed number for newly arrived callbacks (last test).
+ *
+ * The key point is that any later sublist can be assigned the
+ * same ->completed number as the newly arrived callbacks, which
+ * means that the callbacks in any of these later sublists can be
+ * grouped into a single sublist, whether or not they have already
+ * been assigned a ->completed number.
+ */
+ c = rcu_cbs_completed(rsp, rnp);
+ for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+ if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+ !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+ break;
+
+ /*
+ * If there is no sublist for unassigned callbacks, leave.
+ * At the same time, advance "i" one sublist, so that "i" will
+ * index the sublist into which all the remaining callbacks should
+ * be grouped.
+ */
+ if (++i >= RCU_NEXT_TAIL)
+ return;
+
+ /*
+ * Assign all subsequent callbacks' ->completed number to the next
+ * full grace period and group them all in the sublist initially
+ * indexed by "i".
+ */
+ for (; i <= RCU_NEXT_TAIL; i++) {
+ rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+ rdp->nxtcompleted[i] = c;
+ }
+ /* Record any needed additional grace periods. */
+ rcu_start_future_gp(rnp, rdp);
+
+ /* Trace depending on how much we were able to accelerate. */
+ if (!*rdp->nxttail[RCU_WAIT_TAIL])
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
+ else
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
+}
+
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ int i, j;
+
+ /* If the CPU has no callbacks, nothing to do. */
+ if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+ return;
+
+ /*
+ * Find all callbacks whose ->completed numbers indicate that they
+ * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+ */
+ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+ if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+ break;
+ rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+ }
+ /* Clean up any sublist tail pointers that were misordered above. */
+ for (j = RCU_WAIT_TAIL; j < i; j++)
+ rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+
+ /* Copy down callbacks to fill in empty sublists. */
+ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+ if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+ break;
+ rdp->nxttail[j] = rdp->nxttail[i];
+ rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+ }
+
+ /* Classify any remaining callbacks. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
}
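Background: need_future_gp[] has just two slots because at most two grace periods can be outstanding as requests at any time, so indexing by the low bit of the target ->completed number (c & 0x1) gives each request its own counter, and rcu_future_gp_cleanup() clears the slot that just finished while reporting whether the other still has waiters. A worked example of the indexing:

    /* Worked example: rnp->completed == 6 with a GP in progress.
     * rcu_cbs_completed() returns c = 6 + 2 = 8 for a non-root rnp,
     * so the request increments need_future_gp[8 & 0x1] == [0].
     * When GP 7 ends, cleanup zeroes need_future_gp[7 & 0x1] == [1]
     * and returns need_future_gp[(7 + 1) & 0x1] == [0] as "needmore".
     */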
/*
@@ -1080,12 +1322,15 @@ static void
__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
/* Did another grace period end? */
- if (rdp->completed != rnp->completed) {
+ if (rdp->completed == rnp->completed) {
+
+ /* No, so just accelerate recent callbacks. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+
+ } else {
- /* Advance callbacks. No harm if list empty. */
- rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ /* Advance callbacks. */
+ rcu_advance_cbs(rsp, rnp, rdp);
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
@@ -1195,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
- rnp->gpnum = rsp->gpnum;
+ ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
WARN_ON_ONCE(rnp->completed != rsp->completed);
- rnp->completed = rsp->completed;
+ ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1206,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
#ifdef CONFIG_PROVE_RCU_DELAY
- if ((random32() % (rcu_num_nodes * 8)) == 0)
+ if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
+ system_state == SYSTEM_RUNNING)
schedule_timeout_uninterruptible(2);
#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
@@ -1248,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
+ int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1277,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq(&rnp->lock);
- rnp->completed = rsp->gpnum;
+ ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ rdp = this_cpu_ptr(rsp->rda);
+ if (rnp == rdp->mynode)
+ __rcu_process_gp_end(rsp, rnp, rdp);
+ nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched();
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
+ rcu_nocb_gp_set(rnp, nocb);
rsp->completed = rsp->gpnum; /* Declare grace period done. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
+ rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
if (cpu_needs_another_gp(rsp, rdp))
rsp->gp_flags = 1;
raw_spin_unlock_irq(&rnp->lock);
@@ -1363,64 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock, which is released before return. Hard irqs must
- * be disabled.
+ * the root node's ->lock and hard irqs must be disabled.
*
* Note that it is legal for a dying CPU (which is marked as offline) to
* invoke this function. This can happen when the dying CPU reports its
* quiescent state.
*/
static void
-rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
+rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- if (!rsp->gp_kthread ||
- !cpu_needs_another_gp(rsp, rdp)) {
+ if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
* task, this CPU does not need another grace period,
* or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
-
- /*
- * Because there is no grace period in progress right now,
- * any callbacks we have up to this point will be satisfied
- * by the next grace period. So promote all callbacks to be
- * handled after the end of the next grace period. If the
- * CPU is not yet aware of the end of the previous grace period,
- * we need to allow for the callback advancement that will
- * occur when it does become aware. Deadlock prevents us from
- * making it aware at this point: We cannot acquire a leaf
- * rcu_node ->lock while holding the root rcu_node ->lock.
- */
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- if (rdp->completed == rsp->completed)
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
rsp->gp_flags = RCU_GP_FLAG_INIT;
- raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
-
- /* Ensure that CPU is aware of completion of last grace period. */
- rcu_process_gp_end(rsp, rdp);
- local_irq_restore(flags);
/* Wake up rcu_gp_kthread() to start the grace period. */
wake_up(&rsp->gp_wq);
}
/*
+ * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
+ * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
+ * is invoked indirectly from rcu_advance_cbs(), which would result in
+ * endless recursion -- or would do so if it weren't for the self-deadlock
+ * that is encountered beforehand.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp)
+{
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /*
+ * If there is no grace period in progress right now, any
+ * callbacks we have up to this point will be satisfied by the
+ * next grace period. Also, advancing the callbacks reduces the
+ * probability of false positives from cpu_needs_another_gp()
+ * resulting in pointless grace periods. So, advance callbacks
+ * then start the grace period!
+ */
+ rcu_advance_cbs(rsp, rnp, rdp);
+ rcu_start_gp_advanced(rsp, rnp, rdp);
+}
+
+/*
* Report a full set of quiescent states to the specified rcu_state
* data structure. This involves cleaning up after the prior grace
* period and letting rcu_start_gp() start up the next grace period
- * if one is needed. Note that the caller must hold rnp->lock, as
- * required by rcu_start_gp(), which will release it.
+ * if one is needed. Note that the caller must hold rnp->lock, which
+ * is released before return.
*/
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
@@ -1527,7 +1778,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ rcu_accelerate_cbs(rsp, rnp, rdp);
rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
}
@@ -1579,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
/* No-CBs CPUs do not have orphanable callbacks. */
- if (is_nocb_cpu(rdp->cpu))
+ if (rcu_is_nocb_cpu(rdp->cpu))
return;
/*
@@ -1779,7 +2030,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
long bl, count, count_lazy;
int i;
- /* If no callbacks are ready, just return.*/
+ /* If no callbacks are ready, just return. */
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2259,20 @@ __rcu_process_callbacks(struct rcu_state *rsp)
WARN_ON_ONCE(rdp->beenonline == 0);
- /*
- * Advance callbacks in response to end of earlier grace
- * period that some other CPU ended.
- */
+ /* Handle the end of a grace period that some other CPU ended. */
rcu_process_gp_end(rsp, rdp);
/* Update RCU state based on any recent quiescent states. */
rcu_check_quiescent_state(rsp, rdp);
/* Does this CPU require a not-yet-started grace period? */
+ local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
- rcu_start_gp(rsp, flags); /* releases above lock */
+ raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
+ rcu_start_gp(rsp);
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ } else {
+ local_irq_restore(flags);
}
/* If there are callbacks ready, invoke them. */
@@ -2063,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
static void invoke_rcu_core(void)
{
- raise_softirq(RCU_SOFTIRQ);
+ if (cpu_online(smp_processor_id()))
+ raise_softirq(RCU_SOFTIRQ);
}
/*
@@ -2098,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
struct rcu_node *rnp_root = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ raw_spin_lock(&rnp_root->lock);
+ rcu_start_gp(rsp);
+ raw_spin_unlock(&rnp_root->lock);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
@@ -2522,19 +2775,27 @@ static int rcu_pending(int cpu)
}
/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.
+ * Return true if the specified CPU has any callback. If all_lazy is
+ * non-NULL, store an indication of whether all callbacks are lazy.
+ * (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int rcu_cpu_has_callbacks(int cpu)
+static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
{
+ bool al = true;
+ bool hc = false;
+ struct rcu_data *rdp;
struct rcu_state *rsp;
- /* RCU callbacks either ready or pending? */
- for_each_rcu_flavor(rsp)
- if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
- return 1;
- return 0;
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->qlen != rdp->qlen_lazy)
+ al = false;
+ if (rdp->nxtlist)
+ hc = true;
+ }
+ if (all_lazy)
+ *all_lazy = al;
+ return hc;
}
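Background: rcu_cpu_has_callbacks() now answers two questions at once: the return value says whether any RCU flavor has callbacks queued on the CPU, and *all_lazy reports whether every one of them is lazy (kfree_rcu()-style), which the RCU_FAST_NO_HZ code uses to choose a much longer dyntick-idle holdoff. A hedged sketch of a caller on the idle-entry path:

    /* Sketch (assumption): consuming both results when entering idle. */
    bool all_lazy;

    if (!rcu_cpu_has_callbacks(cpu, &all_lazy)) {
            /* no callbacks at all: sleep indefinitely */
    } else if (all_lazy) {
            /* only lazy (kfree) callbacks: a long wakeup timer is fine */
    } else {
            /* non-lazy callbacks: wake near the next grace period */
    }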
/*
@@ -2641,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
+ if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
continue;
rdp = per_cpu_ptr(rsp->rda, cpu);
- if (is_nocb_cpu(cpu)) {
+ if (rcu_is_nocb_cpu(cpu)) {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
rsp->n_barrier_done);
atomic_inc(&rsp->barrier_cpu_count);
@@ -2719,9 +2980,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
-#ifdef CONFIG_RCU_USER_QS
- WARN_ON_ONCE(rdp->dynticks->in_user);
-#endif
rdp->cpu = cpu;
rdp->rsp = rsp;
rcu_boot_init_nocb_percpu_data(rdp);
@@ -2756,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
- rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
/* Add CPU to rcu_node bitmasks. */
@@ -2806,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
- int ret = NOTIFY_OK;
trace_rcu_utilization("Start CPU hotplug");
switch (action) {
@@ -2820,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
- if (nocb_cpu_expendable(cpu))
- rcu_boost_kthread_setaffinity(rnp, cpu);
- else
- ret = NOTIFY_BAD;
+ rcu_boost_kthread_setaffinity(rnp, cpu);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
- /*
- * The whole machine is "stopped" except this CPU, so we can
- * touch any data without introducing corruption. We send the
- * dying CPU's callbacks to an arbitrarily chosen online CPU.
- */
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
- rcu_cleanup_after_idle(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -2847,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
break;
}
trace_rcu_utilization("End CPU hotplug");
- return ret;
+ return NOTIFY_OK;
}
/*
@@ -2938,6 +3185,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ /* Silence gcc 4.8 warning about array index out of range. */
+ if (rcu_num_lvls > RCU_NUM_LVLS)
+ panic("rcu_init_one: rcu_num_lvls overflow");
+
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
@@ -2978,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
+ rcu_init_one_nocb(rnp);
}
}
@@ -3063,8 +3315,7 @@ void __init rcu_init(void)
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
- rcu_init_nocb();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because
@@ -3074,7 +3325,6 @@ void __init rcu_init(void)
cpu_notifier(rcu_cpu_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- check_cpu_stall_init();
}
#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093d..da77a8f57ff9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,24 +88,15 @@ struct rcu_dynticks {
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- int dyntick_drain; /* Prepare-for-idle state variable. */
- unsigned long dyntick_holdoff;
- /* No retries for the jiffy of failure. */
- struct timer_list idle_gp_timer;
- /* Wake up CPU sleeping with callbacks. */
- unsigned long idle_gp_timer_expires;
- /* When to wake up CPU (for repost). */
- bool idle_first_pass; /* First pass of attempt to go idle? */
+ bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
/* # times non-lazy CBs posted to CPU. */
unsigned long nonlazy_posted_snap;
/* idle-period nonlazy_posted snapshot. */
+ unsigned long last_accelerate;
+ /* Last jiffy CBs were accelerated. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-#ifdef CONFIG_RCU_USER_QS
- bool ignore_user_qs; /* Treat userspace as extended QS or not */
- bool in_user; /* Is the CPU in userland from RCU POV? */
-#endif
};
/* RCU's kthread states for tracing. */
@@ -138,9 +129,6 @@ struct rcu_node {
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
- atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
- /* Since this has meaning only for leaf */
- /* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -200,6 +188,12 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_NOCB_CPU
+ wait_queue_head_t nocb_gp_wq[2];
+ /* Place for rcu_nocb_kthread() to wait GP. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ int need_future_gp[2];
+ /* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
@@ -282,6 +276,8 @@ struct rcu_data {
*/
struct rcu_head *nxtlist;
struct rcu_head **nxttail[RCU_NEXT_SIZE];
+ unsigned long nxtcompleted[RCU_NEXT_SIZE];
+ /* grace periods for sublists. */
long qlen_lazy; /* # of lazy queued callbacks */
long qlen; /* # of queued callbacks, incl lazy */
long qlen_last_fqs_check;
@@ -330,6 +326,11 @@ struct rcu_data {
struct task_struct *nocb_kthread;
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ /* 8) RCU CPU stall data. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+ unsigned int softirq_snap; /* Snapshot of softirq activity. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
int cpu;
struct rcu_state *rsp;
};
@@ -343,11 +344,6 @@ struct rcu_data {
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
-#endif
#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
/* to take at least one */
/* scheduling clock irq */
@@ -382,12 +378,6 @@ struct rcu_state {
struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
-#ifdef CONFIG_RCU_NOCB_CPU
- void (*call_remote)(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
- /* call_rcu() flavor, but for */
- /* placing on remote CPU. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* The following fields are guarded by the root rcu_node's lock. */
@@ -450,6 +440,7 @@ struct rcu_state {
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
char *name; /* Name of structure. */
+ char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
};
@@ -527,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
static void __cpuinit rcu_prepare_kthreads(int cpu);
-static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
static void rcu_idle_count_callbacks_posted(void);
@@ -536,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
-static bool is_nocb_cpu(int cpu);
+static int rcu_nocb_needs_gp(struct rcu_state *rsp);
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy);
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp);
-static bool nocb_cpu_expendable(int cpu);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void init_nocb_callback_list(struct rcu_data *rdp);
-static void __init rcu_init_nocb(void);
+static void rcu_kick_nohz_cpu(int cpu);
+static bool init_nocb_callback_list(struct rcu_data *rdp);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..170814dc418f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
+#include <linux/tick.h>
#define RCU_KTHREAD_PRIO 1
@@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void)
if (nr_cpu_ids != NR_CPUS)
printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_NOCB_CPU
+#ifndef CONFIG_RCU_NOCB_CPU_NONE
+ if (!have_rcu_nocb_mask) {
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ have_rcu_nocb_mask = true;
+ }
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+ pr_info("\tExperimental no-CBs CPU 0\n");
+ cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+ pr_info("\tExperimental no-CBs for all CPUs\n");
+ cpumask_setall(rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
if (have_rcu_nocb_mask) {
- if (cpumask_test_cpu(0, rcu_nocb_mask)) {
- cpumask_clear_cpu(0, rcu_nocb_mask);
- pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
- }
cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
if (rcu_nocb_poll)
@@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_state rcu_preempt_state =
- RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
+ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
@@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(cpu);
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
+ return rcu_cpu_has_callbacks(cpu, NULL);
}
/*
@@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void)
*
 * The following preprocessor symbols control this state machine:
*
- * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
- * to satisfy RCU. Beyond this point, it is better to incur a periodic
- * scheduling-clock interrupt than to loop through the state machine
- * at full power.
- * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
- * optional if RCU does not need anything immediately from this
- * CPU, even if this CPU still has RCU callbacks queued. The first
- * times through the state machine are mandatory: we need to give
- * the state machine a chance to communicate a quiescent state
- * to the RCU core.
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
* is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void)
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
-#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
-#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
-extern int tick_nohz_enabled;
-
-/*
- * Does the specified flavor of RCU have non-lazy callbacks pending on
- * the specified CPU? Both RCU flavor and CPU are specified by the
- * rcu_data structure.
- */
-static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
-{
- return rdp->qlen != rdp->qlen_lazy;
-}
+static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
+module_param(rcu_idle_gp_delay, int, 0644);
+static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
+module_param(rcu_idle_lazy_gp_delay, int, 0644);
-#ifdef CONFIG_TREE_PREEMPT_RCU
+extern int tick_nohz_enabled;
/*
- * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
- * is no RCU-preempt in the kernel.)
+ * Try to advance callbacks for all flavors of RCU on the current CPU.
+ * Afterwards, if there are any callbacks ready for immediate invocation,
+ * return true.
*/
-static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
+static bool rcu_try_advance_all_cbs(void)
{
- struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
- return __rcu_cpu_has_nonlazy_callbacks(rdp);
-}
-
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ bool cbs_ready = false;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
-static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
-{
- return 0;
-}
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
-#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
+ /*
+ * Don't bother checking unless a grace period has
+ * completed since we last checked and there are
+ * callbacks not yet ready to invoke.
+ */
+ if (rdp->completed != rnp->completed &&
+ rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+ rcu_process_gp_end(rsp, rdp);
-/*
- * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
- */
-static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
-{
- return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
- __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
- rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ cbs_ready = true;
+ }
+ return cbs_ready;
}
/*
- * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
- * callbacks on this CPU, (2) this CPU has not yet attempted to enter
- * dyntick-idle mode, or (3) this CPU is in the process of attempting to
- * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
- * to enter dyntick-idle mode, we refuse to try to enter it. After all,
- * it is better to incur scheduling-clock interrupts than to spin
- * continuously for the same time duration!
+ * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
+ * to invoke. If the CPU has callbacks, try to advance them. Tell the
+ * caller to set the timeout based on whether or not there are non-lazy
+ * callbacks.
*
- * The delta_jiffies argument is used to store the time when RCU is
- * going to need the CPU again if it still has callbacks. The reason
- * for this is that rcu_prepare_for_idle() might need to post a timer,
- * but if so, it will do so after tick_nohz_stop_sched_tick() has set
- * the wakeup time for this CPU. This means that RCU's timer can be
- * delayed until the wakeup time, which defeats the purpose of posting
- * a timer.
+ * The caller must have disabled interrupts.
*/
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(int cpu, unsigned long *dj)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Flag a new idle sojourn to the idle-entry state machine. */
- rdtp->idle_first_pass = 1;
+ /* Snapshot to detect later posting of non-lazy callback. */
+ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu)) {
- *delta_jiffies = ULONG_MAX;
+ if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ *dj = ULONG_MAX;
return 0;
}
- if (rdtp->dyntick_holdoff == jiffies) {
- /* RCU recently tried and failed, so don't try again. */
- *delta_jiffies = 1;
+
+ /* Attempt to advance callbacks. */
+ if (rcu_try_advance_all_cbs()) {
+ /* Some ready to invoke, so initiate later invocation. */
+ invoke_rcu_core();
return 1;
}
- /* Set up for the possibility that RCU will post a timer. */
- if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
- *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
- RCU_IDLE_GP_DELAY) - jiffies;
+ rdtp->last_accelerate = jiffies;
+
+ /* Request timer delay depending on laziness, and round. */
+ if (!rdtp->all_lazy) {
+ *dj = round_up(rcu_idle_gp_delay + jiffies,
+ rcu_idle_gp_delay) - jiffies;
} else {
- *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
- *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
+ *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
return 0;
}
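
An aside on the rounding above: both branches align the requested wakeup so that CPUs with similar deadlines wake together. A standalone sketch of the non-lazy branch (userspace C; the jiffies value is hypothetical, and note the kernel's round_up() additionally requires a power-of-two multiple):

#include <stdio.h>

/* Generic round-up; the kernel's round_up() assumes a power-of-two 'y'. */
#define ROUND_UP(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long jiffies = 100003, gp_delay = 4;	/* hypothetical */
	unsigned long dj = ROUND_UP(jiffies + gp_delay, gp_delay) - jiffies;

	printf("non-lazy dj = %lu jiffies\n", dj);	/* prints 5 */
	return 0;
}
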
/*
- * Handler for smp_call_function_single(). The only point of this
- * handler is to wake the CPU up, so the handler does only tracing.
- */
-void rcu_idle_demigrate(void *unused)
-{
- trace_rcu_prep_idle("Demigrate");
-}
-
-/*
- * Timer handler used to force CPU to start pushing its remaining RCU
- * callbacks in the case where it entered dyntick-idle mode with callbacks
- * pending. The handler doesn't really need to do anything because the
- * real work is done upon re-entry to idle, or by the next scheduling-clock
- * interrupt should idle not be re-entered.
- *
- * One special case: the timer gets migrated without awakening the CPU
- * on which the timer was scheduled on. In this case, we must wake up
- * that CPU. We do so with smp_call_function_single().
- */
-static void rcu_idle_gp_timer_func(unsigned long cpu_in)
-{
- int cpu = (int)cpu_in;
-
- trace_rcu_prep_idle("Timer");
- if (cpu != smp_processor_id())
- smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
- else
- WARN_ON_ONCE(1); /* Getting here can hang the system... */
-}
-
-/*
- * Initialize the timer used to pull CPUs out of dyntick-idle mode.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- rdtp->dyntick_holdoff = jiffies - 1;
- setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
- rdtp->idle_gp_timer_expires = jiffies - 1;
- rdtp->idle_first_pass = 1;
-}
-
-/*
- * Clean up for exit from idle. Because we are exiting from idle, there
- * is no longer any point to ->idle_gp_timer, so cancel it. This will
- * do nothing if this timer is not active, so just cancel it unconditionally.
- */
-static void rcu_cleanup_after_idle(int cpu)
-{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- del_timer(&rdtp->idle_gp_timer);
- trace_rcu_prep_idle("Cleanup after idle");
- rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
-}
-
-/*
- * Check to see if any RCU-related work can be done by the current CPU,
- * and if so, schedule a softirq to get it done. This function is part
- * of the RCU implementation; it is -not- an exported member of the RCU API.
- *
- * The idea is for the current CPU to clear out all work required by the
- * RCU core for the current grace period, so that this CPU can be permitted
- * to enter dyntick-idle mode. In some cases, it will need to be awakened
- * at the end of the grace period by whatever CPU ends the grace period.
- * This allows CPUs to go dyntick-idle more quickly, and to reduce the
- * number of wakeups by a modest integer factor.
- *
- * Because it is not legal to invoke rcu_process_callbacks() with irqs
- * disabled, we do one pass of force_quiescent_state(), then do a
- * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
- * later. The ->dyntick_drain field controls the sequencing.
+ * Prepare a CPU for idle from an RCU perspective. The first major task
+ * is to sense whether nohz mode has been enabled or disabled via sysfs.
+ * The second major task is to check to see if a non-lazy callback has
+ * arrived at a CPU that previously had only lazy callbacks. The third
+ * major task is to accelerate (that is, assign grace-period numbers to)
+ * any recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
static void rcu_prepare_for_idle(int cpu)
{
- struct timer_list *tp;
+ struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
int tne;
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_enabled);
if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(cpu))
+ if (rcu_cpu_has_callbacks(cpu, NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu)
if (!tne)
return;
- /* Adaptive-tick mode, where usermode execution is idle to RCU. */
- if (!is_idle_task(current)) {
- rdtp->dyntick_holdoff = jiffies - 1;
- if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
- trace_rcu_prep_idle("User dyntick with callbacks");
- rdtp->idle_gp_timer_expires =
- round_up(jiffies + RCU_IDLE_GP_DELAY,
- RCU_IDLE_GP_DELAY);
- } else if (rcu_cpu_has_callbacks(cpu)) {
- rdtp->idle_gp_timer_expires =
- round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
- trace_rcu_prep_idle("User dyntick with lazy callbacks");
- } else {
- return;
- }
- tp = &rdtp->idle_gp_timer;
- mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
+ /* If this is a no-CBs CPU, no callbacks, just return. */
+ if (rcu_is_nocb_cpu(cpu))
return;
- }
/*
- * If this is an idle re-entry, for example, due to use of
- * RCU_NONIDLE() or the new idle-loop tracing API within the idle
- * loop, then don't take any state-machine actions, unless the
- * momentary exit from idle queued additional non-lazy callbacks.
- * Instead, repost the ->idle_gp_timer if this CPU has callbacks
- * pending.
+ * If a non-lazy callback arrived at a CPU having only lazy
+ * callbacks, invoke RCU core for the side-effect of recalculating
+ * idle duration on re-entry to idle.
*/
- if (!rdtp->idle_first_pass &&
- (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
- if (rcu_cpu_has_callbacks(cpu)) {
- tp = &rdtp->idle_gp_timer;
- mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
- }
+ if (rdtp->all_lazy &&
+ rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+ invoke_rcu_core();
return;
}
- rdtp->idle_first_pass = 0;
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
/*
- * If there are no callbacks on this CPU, enter dyntick-idle mode.
- * Also reset state to avoid prejudicing later attempts.
+ * If we have not yet accelerated this jiffy, accelerate all
+ * callbacks on this CPU.
*/
- if (!rcu_cpu_has_callbacks(cpu)) {
- rdtp->dyntick_holdoff = jiffies - 1;
- rdtp->dyntick_drain = 0;
- trace_rcu_prep_idle("No callbacks");
+ if (rdtp->last_accelerate == jiffies)
return;
+ rdtp->last_accelerate = jiffies;
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (!*rdp->nxttail[RCU_DONE_TAIL])
+ continue;
+ rnp = rdp->mynode;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
+}
- /*
- * If in holdoff mode, just return. We will presumably have
- * refrained from disabling the scheduling-clock tick.
- */
- if (rdtp->dyntick_holdoff == jiffies) {
- trace_rcu_prep_idle("In holdoff");
- return;
- }
+/*
+ * Clean up for exit from idle. Attempt to advance callbacks based on
+ * any grace periods that elapsed while the CPU was idle, and if any
+ * callbacks are now ready to invoke, initiate invocation.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+ struct rcu_data *rdp;
+ struct rcu_state *rsp;
- /* Check and update the ->dyntick_drain sequencing. */
- if (rdtp->dyntick_drain <= 0) {
- /* First time through, initialize the counter. */
- rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
- } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
- !rcu_pending(cpu) &&
- !local_softirq_pending()) {
- /* Can we go dyntick-idle despite still having callbacks? */
- rdtp->dyntick_drain = 0;
- rdtp->dyntick_holdoff = jiffies;
- if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
- trace_rcu_prep_idle("Dyntick with callbacks");
- rdtp->idle_gp_timer_expires =
- round_up(jiffies + RCU_IDLE_GP_DELAY,
- RCU_IDLE_GP_DELAY);
- } else {
- rdtp->idle_gp_timer_expires =
- round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
- trace_rcu_prep_idle("Dyntick with lazy callbacks");
- }
- tp = &rdtp->idle_gp_timer;
- mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
- return; /* Nothing more to do immediately. */
- } else if (--(rdtp->dyntick_drain) <= 0) {
- /* We have hit the limit, so time to give up. */
- rdtp->dyntick_holdoff = jiffies;
- trace_rcu_prep_idle("Begin holdoff");
- invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
+ if (rcu_is_nocb_cpu(cpu))
return;
- }
-
- /*
- * Do one step of pushing the remaining RCU callbacks through
- * the RCU core state machine.
- */
-#ifdef CONFIG_TREE_PREEMPT_RCU
- if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
- rcu_preempt_qs(cpu);
- force_quiescent_state(&rcu_preempt_state);
- }
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- if (per_cpu(rcu_sched_data, cpu).nxtlist) {
- rcu_sched_qs(cpu);
- force_quiescent_state(&rcu_sched_state);
- }
- if (per_cpu(rcu_bh_data, cpu).nxtlist) {
- rcu_bh_qs(cpu);
- force_quiescent_state(&rcu_bh_state);
- }
-
- /*
- * If RCU callbacks are still pending, RCU still needs this CPU.
- * So try forcing the callbacks through the grace period.
- */
- if (rcu_cpu_has_callbacks(cpu)) {
- trace_rcu_prep_idle("More callbacks");
- invoke_rcu_core();
- } else {
- trace_rcu_prep_idle("Callbacks drained");
+ rcu_try_advance_all_cbs();
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_core();
}
}
@@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier);
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- struct timer_list *tltp = &rdtp->idle_gp_timer;
- char c;
+ unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
- c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
- if (timer_pending(tltp))
- sprintf(cp, "drain=%d %c timer=%lu",
- rdtp->dyntick_drain, c, tltp->expires - jiffies);
- else
- sprintf(cp, "drain=%d %c timer not pending",
- rdtp->dyntick_drain, c);
+ sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+ rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ulong2long(nlpd),
+ rdtp->all_lazy ? 'L' : '.',
+ rdtp->tick_nohz_enabled_snap ? '.' : 'D');
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
+ printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
fast_no_hz);
}
@@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void)
static void zero_cpu_stall_ticks(struct rcu_data *rdp)
{
rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
}
/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg)
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+/*
+ * Do any no-CBs CPUs need another grace period?
+ *
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rcu_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_nocb_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
+}
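
The new ->need_future_gp[] field is a two-slot ring indexed by grace-period parity: a request made while GP number "completed" is current lands in slot (completed + 1) & 0x1 and is drained when that next GP ends. A minimal standalone model of this indexing (hypothetical type, not kernel code):

struct gp_slots {			/* hypothetical model type */
	unsigned long completed;	/* last completed GP number */
	int need[2];			/* requests for the next two GPs */
};

static void request_next_gp(struct gp_slots *s)
{
	/* Mirrors the rcu_nocb_gp_set()/rcu_nocb_needs_gp() indexing. */
	s->need[(s->completed + 1) & 0x1]++;
}

static int end_gp(struct gp_slots *s)	/* returns drained request count */
{
	int n;

	s->completed++;				/* a GP just ended */
	n = s->need[s->completed & 0x1];	/* its request slot */
	s->need[s->completed & 0x1] = 0;
	return n;
}
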
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+}
+
+/*
+ * Set the root rcu_node structure's ->need_future_gp field
+ * based on the sum of those of all rcu_node structures. This does
+ * double-count the root rcu_node structure's requests, but this
+ * is necessary to handle the possibility of a rcu_nocb_kthread()
+ * having awakened during the time that the rcu_node structures
+ * were being updated for the end of the previous grace period.
+ */
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+ rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_waitqueue_head(&rnp->nocb_gp_wq[0]);
+ init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+}
+
/* Is the specified CPU a no-CBs CPU? */
-static bool is_nocb_cpu(int cpu)
+bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy)
{
- if (!is_nocb_cpu(rdp->cpu))
+ if (!rcu_is_nocb_cpu(rdp->cpu))
return 0;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+ if (__is_kfree_rcu_offset((unsigned long)rhp->func))
+ trace_rcu_kfree_callback(rdp->rsp->name, rhp,
+ (unsigned long)rhp->func,
+ rdp->qlen_lazy, rdp->qlen);
+ else
+ trace_rcu_callback(rdp->rsp->name, rhp,
+ rdp->qlen_lazy, rdp->qlen);
return 1;
}
@@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
long qll = rsp->qlen_lazy;
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
- if (!is_nocb_cpu(smp_processor_id()))
+ if (!rcu_is_nocb_cpu(smp_processor_id()))
return 0;
rsp->qlen = 0;
rsp->qlen_lazy = 0;
@@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
}
/*
- * There must be at least one non-no-CBs CPU in operation at any given
- * time, because no-CBs CPUs are not capable of initiating grace periods
- * independently. This function therefore complains if the specified
- * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
- * avoid offlining the last such CPU. (Recursion is a wonderful thing,
- * but you have to have a base case!)
+ * If necessary, kick off a new grace period, and either way wait
+ * for a subsequent grace period to complete.
*/
-static bool nocb_cpu_expendable(int cpu)
+static void rcu_nocb_wait_gp(struct rcu_data *rdp)
{
- cpumask_var_t non_nocb_cpus;
- int ret;
+ unsigned long c;
+ bool d;
+ unsigned long flags;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ c = rcu_start_future_gp(rnp, rdp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
- * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
- * then offlining this CPU is harmless. Let it happen.
+ * Wait for the grace period. Do so interruptibly to avoid messing
+ * up the load average.
*/
- if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
- return 1;
-
- /* If no memory, play it safe and keep the CPU around. */
- if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
- return 0;
- cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
- cpumask_clear_cpu(cpu, non_nocb_cpus);
- ret = !cpumask_empty(non_nocb_cpus);
- free_cpumask_var(non_nocb_cpus);
- return ret;
-}
-
-/*
- * Helper structure for remote registry of RCU callbacks.
- * This is needed for when a no-CBs CPU needs to start a grace period.
- * If it just invokes call_rcu(), the resulting callback will be queued,
- * which can result in deadlock.
- */
-struct rcu_head_remote {
- struct rcu_head *rhp;
- call_rcu_func_t *crf;
- void (*func)(struct rcu_head *rhp);
-};
-
-/*
- * Register a callback as specified by the rcu_head_remote struct.
- * This function is intended to be invoked via smp_call_function_single().
- */
-static void call_rcu_local(void *arg)
-{
- struct rcu_head_remote *rhrp =
- container_of(arg, struct rcu_head_remote, rhp);
-
- rhrp->crf(rhrp->rhp, rhrp->func);
-}
-
-/*
- * Set up an rcu_head_remote structure and then invoke call_rcu_local()
- * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
- * smp_call_function_single().
- */
-static void invoke_crf_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp),
- call_rcu_func_t crf)
-{
- struct rcu_head_remote rhr;
-
- rhr.rhp = rhp;
- rhr.crf = crf;
- rhr.func = func;
- smp_call_function_single(0, call_rcu_local, &rhr, 1);
-}
-
-/*
- * Helper functions to be passed to wait_rcu_gp(), each of which
- * invokes invoke_crf_remote() to register a callback appropriately.
- */
-static void __maybe_unused
-call_rcu_preempt_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp))
-{
- invoke_crf_remote(rhp, func, call_rcu);
-}
-static void call_rcu_bh_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp))
-{
- invoke_crf_remote(rhp, func, call_rcu_bh);
-}
-static void call_rcu_sched_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp))
-{
- invoke_crf_remote(rhp, func, call_rcu_sched);
+ trace_rcu_future_gp(rnp, rdp, c, "StartWait");
+ for (;;) {
+ wait_event_interruptible(
+ rnp->nocb_gp_wq[c & 0x1],
+ (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ if (likely(d))
+ break;
+ flush_signals(current);
+ trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
+ }
+ trace_rcu_future_gp(rnp, rdp, c, "EndWait");
+ smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
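
The wait condition above compares grace-period numbers with ULONG_CMP_GE() so that counter wraparound is handled correctly; a standalone demo of why a naive >= fails after wrap (the macro body matches the kernel's rcupdate.h definition):

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a >= b" for GP counters, as in rcupdate.h. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long c = ULONG_MAX - 2;	/* GP we are waiting for */
	unsigned long completed = 5;		/* counter wrapped past it */

	printf("naive: %d  wrap-safe: %d\n",
	       completed >= c, ULONG_CMP_GE(completed, c));
	/* prints "naive: 0  wrap-safe: 1" */
	return 0;
}
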
/*
@@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg)
cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
ACCESS_ONCE(rdp->nocb_p_count) += c;
ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
- wait_rcu_gp(rdp->rsp->call_remote);
+ rcu_nocb_wait_gp(rdp);
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
return;
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
+ t = kthread_run(rcu_nocb_kthread, rdp,
+ "rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
ACCESS_ONCE(rdp->nocb_kthread) = t;
}
}
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
-static void init_nocb_callback_list(struct rcu_data *rdp)
+static bool init_nocb_callback_list(struct rcu_data *rdp)
{
if (rcu_nocb_mask == NULL ||
!cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
- return;
+ return false;
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ return true;
}
-/* Initialize the ->call_remote fields in the rcu_state structures. */
-static void __init rcu_init_nocb(void)
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static int rcu_nocb_needs_gp(struct rcu_state *rsp)
{
-#ifdef CONFIG_PREEMPT_RCU
- rcu_preempt_state.call_remote = call_rcu_preempt_remote;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
- rcu_bh_state.call_remote = call_rcu_bh_remote;
- rcu_sched_state.call_remote = call_rcu_sched_remote;
+ return 0;
}
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+}
-static bool is_nocb_cpu(int cpu)
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
{
- return false;
}
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
@@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
return 0;
}
-static bool nocb_cpu_expendable(int cpu)
-{
- return 1;
-}
-
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
@@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
}
-static void init_nocb_callback_list(struct rcu_data *rdp)
+static bool init_nocb_callback_list(struct rcu_data *rdp)
{
+ return false;
}
-static void __init rcu_init_nocb(void)
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * An adaptive-ticks CPU can potentially execute in kernel mode for an
+ * arbitrarily long period of time with the scheduling-clock tick turned
+ * off. RCU will be paying attention to this CPU because it is in the
+ * kernel, but the CPU cannot be guaranteed to be executing the RCU state
+ * machine because the scheduling-clock tick has been disabled. Therefore,
+ * if an adaptive-ticks CPU is failing to respond to the current grace
+ * period and has not been idle from an RCU perspective, kick it.
+ */
+static void rcu_kick_nohz_cpu(int cpu)
{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(cpu))
+ smp_send_reschedule(cpu);
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa670..cf6c17412932 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
-#define ulong2long(a) (*(long *)(&(a)))
-
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
@@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = {
.open = rcubarrier_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
#ifdef CONFIG_RCU_BOOST
@@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = {
.open = rcuexp_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
#ifdef CONFIG_RCU_BOOST
@@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = {
.open = rcuhier_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = {
.open = rcugp_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..b91488ba2e5a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
static void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
- buf->chan->cb->remove_buf_file(buf->dentry);
relay_destroy_buf(buf);
}
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
del_timer_sync(&buf->timer);
+ buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
- chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+ chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
chan->parent = parent;
chan->private_data = private_data;
if (base_filename) {
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
static int subbuf_read_actor(size_t read_start,
struct rchan_buf *buf,
size_t avail,
- read_descriptor_t *desc,
- read_actor_t actor)
+ read_descriptor_t *desc)
{
void *from;
int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
typedef int (*subbuf_actor_t) (size_t read_start,
struct rchan_buf *buf,
size_t avail,
- read_descriptor_t *desc,
- read_actor_t actor);
+ read_descriptor_t *desc);
/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
subbuf_actor_t subbuf_actor,
- read_actor_t actor,
read_descriptor_t *desc)
{
struct rchan_buf *buf = filp->private_data;
@@ -1139,7 +1136,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
if (!desc->count)
return 0;
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_lock(&file_inode(filp)->i_mutex);
do {
if (!relay_file_read_avail(buf, *ppos))
break;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
break;
avail = min(desc->count, avail);
- ret = subbuf_actor(read_start, buf, avail, desc, actor);
+ ret = subbuf_actor(read_start, buf, avail, desc);
if (desc->error < 0)
break;
@@ -1159,7 +1156,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
*ppos = relay_file_read_end_pos(buf, read_start, ret);
}
} while (desc->count && ret);
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&file_inode(filp)->i_mutex);
return desc->written;
}
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
desc.count = count;
desc.arg.buf = buffer;
desc.error = 0;
- return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
- NULL, &desc);
+ return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
}
static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/io.h>
@@ -50,6 +51,14 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
+/*
+ * For memory hotplug, there is no way to free resource entries allocated
+ * by boot mem after the system is up. So to reuse such an entry,
+ * we need to remember it.
+ */
+static struct resource *bootmem_resource_free;
+static DEFINE_SPINLOCK(bootmem_resource_lock);
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
+static void free_resource(struct resource *res)
+{
+ if (!res)
+ return;
+
+ if (!PageSlab(virt_to_head_page(res))) {
+ spin_lock(&bootmem_resource_lock);
+ res->sibling = bootmem_resource_free;
+ bootmem_resource_free = res;
+ spin_unlock(&bootmem_resource_lock);
+ } else {
+ kfree(res);
+ }
+}
+
+static struct resource *alloc_resource(gfp_t flags)
+{
+ struct resource *res = NULL;
+
+ spin_lock(&bootmem_resource_lock);
+ if (bootmem_resource_free) {
+ res = bootmem_resource_free;
+ bootmem_resource_free = res->sibling;
+ }
+ spin_unlock(&bootmem_resource_lock);
+
+ if (res)
+ memset(res, 0, sizeof(struct resource));
+ else
+ res = kzalloc(sizeof(struct resource), flags);
+
+ return res;
+}
+
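
A minimal standalone model of the recycle scheme used by free_resource()/alloc_resource() above: objects that cannot go back to the allocator (flagged here by the caller rather than by PageSlab()) are parked on a LIFO list and handed out before any new allocation. Locking is omitted for brevity; all names are hypothetical:

#include <stdlib.h>
#include <string.h>

struct obj { struct obj *next; int payload; };
static struct obj *free_list;

static void obj_free(struct obj *o, int from_boot)
{
	if (from_boot) {		/* cannot free(): recycle instead */
		o->next = free_list;
		free_list = o;
	} else {
		free(o);
	}
}

static struct obj *obj_alloc(void)
{
	struct obj *o = free_list;

	if (o) {			/* reuse a parked entry first */
		free_list = o->next;
		memset(o, 0, sizeof(*o));
	} else {
		o = calloc(1, sizeof(*o));
	}
	return o;
}
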
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
write_unlock(&resource_lock);
}
-/**
- * adjust_resource - modify a resource's start and size
- * @res: resource to modify
- * @start: new start value
- * @size: new size
- *
- * Given an existing resource, change its start and size to match the
- * arguments. Returns 0 on success, -EBUSY if it can't fit.
- * Existing children of the resource are assumed to be immutable.
- */
-int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
+static int __adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
{
struct resource *tmp, *parent = res->parent;
resource_size_t end = start + size - 1;
int result = -EBUSY;
- write_lock(&resource_lock);
-
if (!parent)
goto skip;
@@ -751,6 +783,26 @@ skip:
result = 0;
out:
+ return result;
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
+{
+ int result;
+
+ write_lock(&resource_lock);
+ result = __adjust_resource(res, start, size);
write_unlock(&resource_lock);
return result;
}
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
- struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
/* conflict covered whole area */
if (conflict->start <= res->start &&
conflict->end >= res->end) {
- kfree(res);
+ free_resource(res);
WARN_ON(next_res);
break;
}
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
end = res->end;
res->end = conflict->start - 1;
if (conflict->end < end) {
- next_res = kzalloc(sizeof(*next_res),
- GFP_ATOMIC);
+ next_res = alloc_resource(GFP_ATOMIC);
if (!next_res) {
- kfree(res);
+ free_resource(res);
break;
}
next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = alloc_resource(GFP_KERNEL);
if (!res)
return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- kfree(res);
+ free_resource(res);
res = NULL;
break;
}
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
return -EBUSY;
release_resource(res);
- kfree(res);
+ free_resource(res);
return 0;
}
EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
if (res->flags & IORESOURCE_MUXED)
wake_up(&muxed_resource_wait);
- kfree(res);
+ free_resource(res);
return;
}
p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
}
EXPORT_SYMBOL(__release_region);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/**
+ * release_mem_region_adjustable - release a previously reserved memory region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @size: resource region size
+ *
+ * This interface is intended for memory hot-delete. The requested region
+ * is released from a currently busy memory resource. The requested region
+ * must either match exactly or fit into a single busy resource entry. In
+ * the latter case, the remaining resource is adjusted accordingly.
+ * Existing children of the busy memory resource must be immutable in the
+ * request.
+ *
+ * Note:
+ * - Additional release conditions, such as overlapping regions, can be
+ * supported after they are confirmed as valid cases.
+ * - When a busy memory resource gets split into two entries, the code
+ * assumes that all children remain in the lower address entry for
+ * simplicity. Enhance this logic when necessary.
+ */
+int release_mem_region_adjustable(struct resource *parent,
+ resource_size_t start, resource_size_t size)
+{
+ struct resource **p;
+ struct resource *res;
+ struct resource *new_res;
+ resource_size_t end;
+ int ret = -EINVAL;
+
+ end = start + size - 1;
+ if ((start < parent->start) || (end > parent->end))
+ return ret;
+
+ /* The alloc_resource() result gets checked later */
+ new_res = alloc_resource(GFP_KERNEL);
+
+ p = &parent->child;
+ write_lock(&resource_lock);
+
+ while ((res = *p)) {
+ if (res->start >= end)
+ break;
+
+ /* look for the next resource if this one does not fit */
+ if (res->start > start || res->end < end) {
+ p = &res->sibling;
+ continue;
+ }
+
+ if (!(res->flags & IORESOURCE_MEM))
+ break;
+
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+
+ /* found the target resource; let's adjust accordingly */
+ if (res->start == start && res->end == end) {
+ /* free the whole entry */
+ *p = res->sibling;
+ free_resource(res);
+ ret = 0;
+ } else if (res->start == start && res->end != end) {
+ /* adjust the start */
+ ret = __adjust_resource(res, end + 1,
+ res->end - end);
+ } else if (res->start != start && res->end == end) {
+ /* adjust the end */
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ } else {
+ /* split into two entries */
+ if (!new_res) {
+ ret = -ENOMEM;
+ break;
+ }
+ new_res->name = res->name;
+ new_res->start = end + 1;
+ new_res->end = res->end;
+ new_res->flags = res->flags;
+ new_res->parent = res->parent;
+ new_res->sibling = res->sibling;
+ new_res->child = NULL;
+
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ if (ret)
+ break;
+ res->sibling = new_res;
+ new_res = NULL;
+ }
+
+ break;
+ }
+
+ write_unlock(&resource_lock);
+ free_resource(new_res);
+ return ret;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
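
A hedged caller sketch for the new interface (try_release_mem and its message are hypothetical; the expected in-tree caller is the memory hot-remove path):

/* Hypothetical hot-remove helper showing the intended use: */
static int try_release_mem(resource_size_t start, resource_size_t size)
{
	int ret = release_mem_region_adjustable(&iomem_resource,
						start, size);

	if (ret)
		pr_warn("memory hot-remove: cannot release %pa+%pa (%d)\n",
			&start, &size, ret);
	return ret;
}
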
+
/*
* Managed region resource
*/
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
* See rt.c in preempt-rt for proper credits and further information
*/
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,9 +10,11 @@
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/stat.h>
#include "rtmutex.h"
@@ -365,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
return curr - buf;
}
-static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
static struct bus_type rttest_subsys = {
.name = "rttest",
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
#include <linux/spinlock.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd847..cfff1435bdfb 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
EXPORT_SYMBOL(_down_write_nest_lock);
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
#endif
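
A hedged usage sketch for the newly exported non-owner primitives (my_sem, my_wq, and my_work are hypothetical): they allow the release to happen in a different task than the acquisition, a pattern the lockdep-tracked variants are not meant for:

	/* submitting task: */
	down_read_non_owner(&my_sem);
	queue_work(my_wq, &my_work);	/* hand the critical section off */

	/* worker callback, running in another task: */
	up_read_non_owner(&my_sem);
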
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
+ sched_offline_group(ag->tg);
sched_destroy_group(ag->tg);
}
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
if (IS_ERR(tg))
goto out_free;
+ sched_online_group(tg, &root_task_group);
+
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomically as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of the low 32bit and the
+ * high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit in between the readout of
+ * the low 32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
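
The cmpxchg64(&scd->clock, 0, 0) in the hunk above is the usual idiom for an atomic 64-bit load on 32-bit machines: a compare-and-swap with old == new == 0 never changes the stored value but always returns the current contents, read atomically. A conceptual kernel-style sketch (not part of the patch):

static inline u64 atomic64_read_via_cmpxchg(u64 *p)
{
	/* Swapping 0 for 0 leaves *p intact yet returns its value. */
	return cmpxchg64(p, 0, 0);
}
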
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26058d0bebba..58453b8272fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
#endif
#include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
#include "../smpboot.h"
#define CREATE_TRACE_POINTS
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
* the target CPU.
*/
#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) 0
-#endif
-
void resched_task(struct task_struct *p)
{
int cpu;
@@ -549,7 +544,7 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
@@ -587,7 +582,7 @@ unlock:
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ smp_send_reschedule(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
static inline bool got_nohz_idle_kick(void)
{
return false;
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+ struct rq *rq;
+
+ rq = this_rq();
+
+ /* Make sure rq->nr_running update is visible after the IPI */
+ smp_rmb();
+
+ /* More than one running task need preemption */
+ if (rq->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
@@ -1132,18 +1163,28 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ int nid = cpu_to_node(cpu);
+ const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
- /* Look for allowed, online CPU in same node. */
- for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- return dest_cpu;
+ /*
+ * If the node that the cpu is on has been offlined, cpu_to_node()
+ * will return -1. There is no cpu on the node, and we should
+ * select a cpu on another node.
+ */
+ if (nid != -1) {
+ nodemask = cpumask_of_node(nid);
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return dest_cpu;
+ }
}
for (;;) {
@@ -1278,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
+ trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -1352,7 +1393,8 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
+ && !tick_nohz_full_cpu(smp_processor_id()))
return;
/*
@@ -1369,6 +1411,7 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
+ tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -1488,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1742,9 +1787,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
@@ -1753,9 +1797,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
struct preempt_notifier *notifier;
- struct hlist_node *node;
- hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
notifier->ops->sched_out(notifier, next);
}
@@ -1850,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+ tick_nohz_task_switch(current);
}
#ifdef CONFIG_SMP
@@ -1969,11 +2014,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
@@ -1985,23 +2029,6 @@ unsigned long nr_running(void)
return sum;
}
-unsigned long nr_uninterruptible(void)
-{
- unsigned long i, sum = 0;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_uninterruptible;
-
- /*
- * Since we read the counters lockless, it might be slightly
- * inaccurate. Do not allow it to go below zero though:
- */
- if (unlikely((long)sum < 0))
- sum = 0;
-
- return sum;
-}
-
unsigned long long nr_context_switches(void)
{
int i;
@@ -2131,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
return load >> FSHIFT;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Handle NO_HZ for the global load-average.
*
@@ -2357,12 +2384,12 @@ static void calc_global_nohz(void)
smp_wmb();
calc_load_idx++;
}
-#else /* !CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2522,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2582,7 +2609,7 @@ void update_cpu_load_nohz(void)
}
raw_spin_unlock(&this_rq->lock);
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from scheduler_tick()
@@ -2709,7 +2736,34 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
+ rq_last_tick_reset(rq);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = ACCESS_ONCE(jiffies);
+
+ next = rq->last_sched_tick + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
}
+#endif
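
A worked example of the deferment arithmetic above, as standalone C with hypothetical numbers (HZ assumed to be 1000):

#include <stdio.h>

#define HZ		1000	/* assumed tick rate for this demo */
#define NSEC_PER_USEC	1000ULL

int main(void)
{
	unsigned long last_sched_tick = 5000, now = 5400;	/* jiffies */
	unsigned long next = last_sched_tick + HZ;
	unsigned long long usecs = (next - now) * (1000000UL / HZ);

	printf("max deferment: %llu ns\n", usecs * NSEC_PER_USEC);
	/* prints 600000000: one forced tick per second is preserved */
	return 0;
}
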
notrace unsigned long get_parent_ip(unsigned long addr)
{
@@ -2786,7 +2840,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (irqs_disabled())
print_irqtrace_events(prev);
dump_stack();
- add_taint(TAINT_WARN);
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
/*
@@ -3007,51 +3061,6 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
- if (lock->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
-
- return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
- if (!sched_feat(OWNER_SPIN))
- return 0;
-
- rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
- break;
-
- arch_mutex_cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
-}
-#endif
-
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -3092,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule);
asmlinkage void __sched preempt_schedule_irq(void)
{
struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_state;
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
- user_exit();
+ prev_state = exception_enter();
+
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
@@ -3110,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
barrier();
} while (need_resched());
+
+ exception_exit(prev_state);
}
#endif /* CONFIG_PREEMPT */
@@ -3268,7 +3281,8 @@ void complete_all(struct completion *x)
EXPORT_SYMBOL(complete_all);
static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
+do_wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
@@ -3281,7 +3295,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
- timeout = schedule_timeout(timeout);
+ timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
@@ -3292,17 +3306,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
return timeout ?: 1;
}
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
+static inline long __sched
+__wait_for_common(struct completion *x,
+ long (*action)(long), long timeout, int state)
{
might_sleep();
spin_lock_irq(&x->wait.lock);
- timeout = do_wait_for_common(x, timeout, state);
+ timeout = do_wait_for_common(x, action, timeout, state);
spin_unlock_irq(&x->wait.lock);
return timeout;
}
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, schedule_timeout, timeout, state);
+}
+
+static long __sched
+wait_for_common_io(struct completion *x, long timeout, int state)
+{
+ return __wait_for_common(x, io_schedule_timeout, timeout, state);
+}
+
/**
* wait_for_completion: - waits for completion of a task
* @x: holds the state of this particular completion
@@ -3339,6 +3366,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
EXPORT_SYMBOL(wait_for_completion_timeout);
/**
+ * wait_for_completion_io: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout. The caller is accounted as waiting
+ * for IO.
+ */
+void __sched wait_for_completion_io(struct completion *x)
+{
+ wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io);
+
+/**
+ * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible. The caller is accounted as waiting for IO.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
+{
+ return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_io_timeout);
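
A minimal usage sketch; submit_my_request() is a hypothetical helper whose completion callback calls complete(&done). The point of the _io variants is that the sleep is charged as iowait rather than as a plain uninterruptible sleep:

    struct completion done;

    init_completion(&done);
    submit_my_request(&done);       /* hypothetical; completes via complete(&done) */
    wait_for_completion_io(&done);  /* caller accounted as waiting for IO */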
+
+/**
* wait_for_completion_interruptible: - waits for completion of a task (w/intr)
* @x: holds the state of this particular completion
*
@@ -4089,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
get_task_struct(p);
rcu_read_unlock();
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
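
PF_NO_SETAFFINITY is the flag kernel threads set on themselves when userspace must not retarget them; a sketch of the producer side (the exact call sites, e.g. per-cpu kthreads and workqueue workers, are an assumption here):

    /* Mark the current kthread as affinity-locked: sched_setaffinity()
     * on it will now fail with -EINVAL thanks to the check above. */
    current->flags |= PF_NO_SETAFFINITY;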
@@ -4364,20 +4428,32 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns true if we indeed boosted the target task.
+ * Returns:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
*/
bool __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
unsigned long flags;
- bool yielded = 0;
+ int yielded = 0;
local_irq_save(flags);
rq = this_rq();
again:
p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+ yielded = -ESRCH;
+ goto out_irq;
+ }
+
double_rq_lock(rq, p_rq);
while (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
@@ -4385,13 +4461,13 @@ again:
}
if (!curr->sched_class->yield_to_task)
- goto out;
+ goto out_unlock;
if (curr->sched_class != p->sched_class)
- goto out;
+ goto out_unlock;
if (task_running(p_rq, p) || p->state)
- goto out;
+ goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
@@ -4404,11 +4480,12 @@ again:
resched_task(p_rq->curr);
}
-out:
+out_unlock:
double_rq_unlock(rq, p_rq);
+out_irq:
local_irq_restore(flags);
- if (yielded)
+ if (yielded > 0)
schedule();
return yielded;
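
A hedged sketch of a caller handling the new tri-state return, e.g. a paravirtual spinlock slowpath yielding to the presumed lock holder (find_lock_holder() is hypothetical):

    struct task_struct *holder = find_lock_holder(lock);   /* hypothetical */
    int ret = yield_to(holder, false);

    if (ret > 0) {
        /* we boosted the holder and have already rescheduled */
    } else if (ret == -ESRCH) {
        /* both rqs have a single task; nothing to yield to */
    } else {
        /* holder not yieldable (wrong class, already running, ...) */
    }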
@@ -4576,6 +4653,7 @@ void sched_show_task(struct task_struct *p)
task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
+ print_worker_info(KERN_INFO, p);
show_stack(p, NULL);
}
@@ -4667,6 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
+ vtime_init_idle(idle);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
@@ -4722,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
- ret = -EINVAL;
- goto out;
- }
-
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
@@ -4948,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
}
static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
@@ -6197,7 +6271,7 @@ static void sched_init_numa(void)
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
*
- * The sched_domains_nume_distance[] array includes the actual distance
+ * The sched_domains_numa_distance[] array includes the actual distance
* numbers.
*/
@@ -6810,11 +6884,15 @@ int in_sched_functions(unsigned long addr)
}
#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
struct task_group root_task_group;
LIST_HEAD(task_groups);
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void)
{
@@ -6851,7 +6929,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
- per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ per_cpu(load_balance_mask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6877,12 +6955,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
- root_cpuacct.cpustat = &kernel_cpustat;
- root_cpuacct.cpuusage = alloc_percpu(u64);
- /* Too early, not expected to fail */
- BUG_ON(!root_cpuacct.cpuusage);
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -6946,9 +7018,12 @@ void __init sched_init(void)
INIT_LIST_HEAD(&rq->cfs_tasks);
rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
rq->nohz_flags = 0;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = 0;
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7160,7 +7235,6 @@ static void free_sched_group(struct task_group *tg)
struct task_group *sched_create_group(struct task_group *parent)
{
struct task_group *tg;
- unsigned long flags;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
@@ -7172,6 +7246,17 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ return tg;
+
+err:
+ free_sched_group(tg);
+ return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
spin_lock_irqsave(&task_group_lock, flags);
list_add_rcu(&tg->list, &task_groups);
@@ -7181,12 +7266,6 @@ struct task_group *sched_create_group(struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- return tg;
-
-err:
- free_sched_group(tg);
- return ERR_PTR(-ENOMEM);
}
/* rcu callback to free various structures associated with a task group */
@@ -7199,6 +7278,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
unsigned long flags;
int i;
@@ -7210,9 +7295,6 @@ void sched_destroy_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
}
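
With the create/online and destroy/offline split, a non-cgroup user of task groups is expected to pair the calls as in this sketch (autogroup is the in-tree caller this refactoring targets):

    struct task_group *tg;

    tg = sched_create_group(&root_task_group);
    if (IS_ERR(tg))
        return PTR_ERR(tg);
    sched_online_group(tg, &root_task_group);  /* now visible to list walkers */

    /* ... and teardown in the reverse order: */
    sched_offline_group(tg);   /* unlink from the task_groups list */
    sched_destroy_group(tg);   /* RCU-deferred free */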
/* change task's runqueue when it moves between groups.
@@ -7397,7 +7479,7 @@ unlock:
return err;
}
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;
@@ -7409,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
@@ -7421,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
{
u64 rt_runtime, rt_period;
@@ -7434,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;
@@ -7469,7 +7551,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7508,6 +7590,25 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
+int sched_rr_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ /* make sure that internally we keep jiffies */
+ /* also, writing zero resets timeslice to default */
+ if (!ret && write) {
+ sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+ RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ }
+ mutex_unlock(&mutex);
+ return ret;
+}
+
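
A userspace sketch of driving this handler; the file name assumes the sched_rr_timeslice_ms sysctl entry that this series wires up:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/kernel/sched_rr_timeslice_ms", "w");

        if (!f)
            return 1;
        fprintf(f, "%d\n", 50);   /* 50 ms; writing 0 restores RR_TIMESLICE */
        return fclose(f) ? 1 : 0;
    }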
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -7564,6 +7665,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
return &tg->css;
}
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct task_group *parent;
+
+ if (!cgrp->parent)
+ return 0;
+
+ parent = cgroup_tg(cgrp->parent);
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_free(struct cgroup *cgrp)
{
struct task_group *tg = cgroup_tg(cgrp);
@@ -7571,6 +7685,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
sched_destroy_group(tg);
}
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+
+ sched_offline_group(tg);
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -7926,6 +8047,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
+ .css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
@@ -7936,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-struct cpuacct root_cpuacct;
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
-{
- struct cpuacct *ca;
-
- if (!cgrp->parent)
- return &root_cpuacct.css;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- goto out;
-
- ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage)
- goto out_free_ca;
-
- ca->cpustat = alloc_percpu(struct kernel_cpustat);
- if (!ca->cpustat)
- goto out_free_cpuusage;
-
- return &ca->css;
-
-out_free_cpuusage:
- free_percpu(ca->cpuusage);
-out_free_ca:
- kfree(ca);
-out:
- return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
-
- free_percpu(ca->cpustat);
- free_percpu(ca->cpuusage);
- kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- u64 data;
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
-#endif
-
- return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- u64 totalcpuusage = 0;
- int i;
-
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
- return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int err = 0;
- int i;
-
- if (reset) {
- err = -EINVAL;
- goto out;
- }
-
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
-
-out:
- return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
-{
- struct cpuacct *ca = cgroup_ca(cgroup);
- u64 percpu;
- int i;
-
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
- seq_printf(m, "%llu ", (unsigned long long) percpu);
- }
- seq_printf(m, "\n");
- return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int cpu;
- s64 val = 0;
-
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
- val = 0;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
- }
-
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
- return 0;
-}
-
-static struct cftype files[] = {
- {
- .name = "usage",
- .read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
- },
- {
- .name = "usage_percpu",
- .read_seq_string = cpuacct_percpu_seq_read,
- },
- {
- .name = "stat",
- .read_map = cpuacct_stats_show,
- },
- { } /* terminate */
-};
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
- struct cpuacct *ca;
- int cpu;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- cpu = task_cpu(tsk);
-
- rcu_read_lock();
-
- ca = task_ca(tsk);
-
- for (; ca; ca = parent_ca(ca)) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
- }
-
- rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
- .css_alloc = cpuacct_css_alloc,
- .css_free = cpuacct_css_free,
- .subsys_id = cpuacct_subsys_id,
- .base_cftypes = files,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
-
void dump_cpu_task(int cpu)
{
pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
+{
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ if (!ca->css.cgroup->parent)
+ return NULL;
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+{
+ struct cpuacct *ca;
+
+ if (!cgrp->parent)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup *cgrp)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+ u64 reset)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+ return err;
+}
+
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuacct *ca = cgroup_ca(cgroup);
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_percpu",
+ .read_seq_string = cpuacct_percpu_seq_read,
+ },
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int cpu;
+
+ cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
+ ca = task_ca(tsk);
+
+ while (true) {
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+
+ ca = parent_ca(ca);
+ if (!ca)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca != &root_cpuacct) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += val;
+ ca = __parent_ca(ca);
+ }
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+ .name = "cpuacct",
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .subsys_id = cpuacct_subsys_id,
+ .base_cftypes = files,
+ .early_init = 1,
+};
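
Reading the files declared above from userspace, as a hedged sketch; the /sys/fs/cgroup/cpuacct path assumes the conventional mount point for this controller:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long usage_ns;
        FILE *f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.usage", "r");

        if (!f || fscanf(f, "%llu", &usage_ns) != 1)
            return 1;
        printf("total cpu usage: %llu ns\n", usage_ns);
        return fclose(f) ? 1 : 0;
    }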
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
*/
#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..cc2dc3eea8a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
+#include <linux/context_tracking.h>
#include "sched.h"
@@ -114,10 +115,6 @@ static int irqtime_account_si_update(void)
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
/*
* Since all updates are sure to touch the root cgroup, we
* get ourselves ahead and touch it first. If the root cgroup
@@ -126,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
-#endif
+ cpuacct_account_field(p, index, tmp);
}
/*
@@ -163,7 +148,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
task_group_account_field(p, index, (__force u64) cputime);
/* Account for user time used */
- acct_update_integrals(p);
+ acct_account_cputime(p);
}
/*
@@ -213,7 +198,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
task_group_account_field(p, index, (__force u64) cputime);
/* Account for system time used */
- acct_update_integrals(p);
+ acct_account_cputime(p);
}
/*
@@ -295,6 +280,7 @@ static __always_inline bool steal_account_process_tick(void)
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
struct signal_struct *sig = tsk->signal;
+ cputime_t utime, stime;
struct task_struct *t;
times->utime = sig->utime;
@@ -308,16 +294,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- times->utime += t->utime;
- times->stime += t->stime;
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
} while_each_thread(tsk, t);
out:
rcu_read_unlock();
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* Account a tick to a process and cpustat
@@ -382,12 +367,90 @@ static void irqtime_account_idle_ticks(int ticks)
irqtime_account_process_tick(current, 0, rq);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_task_switch(struct task_struct *prev)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_account_irq_enter(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because context tracking doesn't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
* @user_tick: indicates if the tick is a user or a system tick
@@ -397,6 +460,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (vtime_accounting_enabled())
+ return;
+
if (sched_clock_irqtime) {
irqtime_account_process_tick(p, user_tick, rq);
return;
@@ -439,88 +505,49 @@ void account_idle_ticks(unsigned long ticks)
account_idle_time(jiffies_to_cputime(ticks));
}
-#endif
-
/*
- * Use precise platform statistics if available:
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * losing precision when the numbers are big.
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-
-void vtime_account_system_irqsafe(struct task_struct *tsk)
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
- unsigned long flags;
-
- local_irq_save(flags);
- vtime_account_system(tsk);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
-
- vtime_account_user(prev);
- arch_vtime_task_switch(prev);
-}
-#endif
-
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account(struct task_struct *tsk)
-{
- if (in_interrupt() || !is_idle_task(tsk))
- vtime_account_system(tsk);
- else
- vtime_account_idle(tsk);
-}
-EXPORT_SYMBOL_GPL(vtime_account);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
-
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
-{
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
-
- if (sizeof(cputime_t) == 4)
- temp = div_u64(temp, (__force u32) total);
- else
- temp = div64_u64(temp, (__force u64) total);
+ u64 scaled;
+
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime) {
+ u64 tmp = rtime; rtime = stime; stime = tmp;
+ }
+
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
+
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
+
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
+
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
+
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
- return (__force cputime_t) temp;
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
}
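
A standalone userspace rendition of the loop above, handy for convincing oneself that the stime/total ratio survives the precision drops; kernel types and div_u64() are replaced with plain C, so this is a sketch rather than the kernel code:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
    {
        for (;;) {
            if (stime > rtime) {            /* keep rtime the bigger one */
                uint64_t tmp = rtime; rtime = stime; stime = tmp;
            }
            if (total >> 32)                /* 'total' must fit in 32 bits */
                goto drop_precision;
            if (!(rtime >> 32))             /* rtime (and stime) fit: done */
                break;
            if (stime >> 31)                /* rebalancing would overflow */
                goto drop_precision;
            stime <<= 1;                    /* grow stime, shrink rtime */
            rtime >>= 1;
            continue;
    drop_precision:
            rtime >>= 1;                    /* rtime has more bits than stime */
            total >>= 1;
        }
        return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
    }

    int main(void)
    {
        /* stime is half of total; the naive stime * rtime product would
         * overflow 64 bits, yet the result stays close to rtime / 2. */
        printf("%llu\n", (unsigned long long)
               scale_stime(10000000000000ULL, 17000000000000ULL,
                           20000000000000ULL));
        return 0;
    }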
/*
@@ -531,10 +558,16 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, utime, total;
+ cputime_t rtime, stime, utime, total;
+
+ if (vtime_accounting_enabled()) {
+ *ut = curr->utime;
+ *st = curr->stime;
+ return;
+ }
- utime = curr->utime;
- total = utime + curr->stime;
+ stime = curr->stime;
+ total = stime + curr->utime;
/*
* Tick based cputime accounting depend on random scheduling
@@ -548,19 +581,32 @@ static void cputime_adjust(struct task_cputime *curr,
*/
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- if (total)
- utime = scale_utime(utime, rtime, total);
- else
- utime = rtime;
+ /*
+ * Update the userspace-visible utime/stime values only if the actual
+ * execution time is bigger than what was already exported. This can
+ * happen because we may have provided bigger values earlier, due to
+ * scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ if (total) {
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ } else {
+ stime = rtime;
+ utime = 0;
+ }
/*
* If the tick based count grows faster than the scheduler one,
* the result of the scaling may go backward.
* Let's enforce monotonicity.
*/
+ prev->stime = max(prev->stime, stime);
prev->utime = max(prev->utime, utime);
- prev->stime = max(prev->stime, rtime - prev->utime);
+out:
*ut = prev->utime;
*st = prev->stime;
}
@@ -568,11 +614,10 @@ static void cputime_adjust(struct task_cputime *curr,
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime = {
- .utime = p->utime,
- .stime = p->stime,
.sum_exec_runtime = p->se.sum_exec_runtime,
};
+ task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
@@ -586,4 +631,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long clock;
+
+ clock = local_clock();
+ if (clock < tsk->vtime_snap)
+ return 0;
+
+ return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+ unsigned long long delta = vtime_delta(tsk);
+
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ tsk->vtime_snap += delta;
+
+ /* CHECKME: always safe to convert nsecs to cputime? */
+ return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_irq_exit(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ if (context_tracking_in_user())
+ tsk->vtime_snap_whence = VTIME_USER;
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+ cputime_t delta_cpu;
+
+ if (!vtime_accounting_enabled())
+ return;
+
+ delta_cpu = get_vtime_delta(tsk);
+
+ write_seqlock(&tsk->vtime_seqlock);
+ tsk->vtime_snap_whence = VTIME_SYS;
+ account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ write_seqlock(&tsk->vtime_seqlock);
+ tsk->vtime_snap_whence = VTIME_USER;
+ __vtime_account_system(tsk);
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+ write_seqlock(&tsk->vtime_seqlock);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+ write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+ cputime_t delta_cpu = get_vtime_delta(tsk);
+
+ account_idle_time(delta_cpu);
+}
+
+bool vtime_accounting_enabled(void)
+{
+ return context_tracking_active();
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+ write_seqlock(&prev->vtime_seqlock);
+ prev->vtime_snap_whence = VTIME_SLEEPING;
+ write_sequnlock(&prev->vtime_seqlock);
+
+ write_seqlock(&current->vtime_seqlock);
+ current->vtime_snap_whence = VTIME_SYS;
+ current->vtime_snap = sched_clock();
+ write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&t->vtime_seqlock, flags);
+ t->vtime_snap_whence = VTIME_SYS;
+ t->vtime_snap = sched_clock();
+ write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+ unsigned int seq;
+ cputime_t gtime;
+
+ do {
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ gtime = t->gtime;
+ if (t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+
+ return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+ cputime_t *u_dst, cputime_t *s_dst,
+ cputime_t *u_src, cputime_t *s_src,
+ cputime_t *udelta, cputime_t *sdelta)
+{
+ unsigned int seq;
+ unsigned long long delta;
+
+ do {
+ *udelta = 0;
+ *sdelta = 0;
+
+ seq = read_seqbegin(&t->vtime_seqlock);
+
+ if (u_dst)
+ *u_dst = *u_src;
+ if (s_dst)
+ *s_dst = *s_src;
+
+ /* Task is sleeping, nothing to add */
+ if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+
+ /*
+ * The task runs either in user or kernel space; add the pending nohz time to
+ * the right place.
+ */
+ if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+ *udelta = delta;
+ } else {
+ if (t->vtime_snap_whence == VTIME_SYS)
+ *sdelta = delta;
+ }
+ } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utime, stime, &t->utime,
+ &t->stime, &udelta, &sdelta);
+ if (utime)
+ *utime += udelta;
+ if (stime)
+ *stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+ cputime_t *utimescaled, cputime_t *stimescaled)
+{
+ cputime_t udelta, sdelta;
+
+ fetch_task_cputime(t, utimescaled, stimescaled,
+ &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+ if (utimescaled)
+ *utimescaled += cputime_to_scaled(udelta);
+ if (stimescaled)
+ *stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
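
The read side above is the standard seqlock retry pattern; a self-contained sketch of the same idiom on a toy pair of counters (kernel context assumed, <linux/seqlock.h> and <linux/types.h>):

    struct sample {
        seqlock_t lock;
        u64 user, sys;
    };

    /* writer: readers never observe one field updated without the other */
    static void sample_add(struct sample *s, u64 duser, u64 dsys)
    {
        write_seqlock(&s->lock);
        s->user += duser;
        s->sys  += dsys;
        write_sequnlock(&s->lock);
    }

    /* reader: retries the snapshot until no writer raced with it */
    static void sample_read(struct sample *s, u64 *user, u64 *sys)
    {
        unsigned int seq;

        do {
            seq = read_seqbegin(&s->lock);
            *user = s->user;
            *sys  = s->sys;
        } while (read_seqretry(&s->lock, seq));
    }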
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- /*
- * May be NULL if the underlying cgroup isn't fully-created yet
- */
- if (!tg->css.cgroup) {
- group_path[0] = '\0';
- return group_path;
- }
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
return group_path;
}
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
{
unsigned int freq = cpu_khz ? : 1;
- SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+ SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
#else
- SEQ_printf(m, "\ncpu#%d\n", cpu);
+ SEQ_printf(m, "cpu#%d\n", cpu);
#endif
#define P(x) \
@@ -330,6 +323,7 @@ do { \
print_rq(m, rq, cpu);
rcu_read_unlock();
spin_unlock_irqrestore(&sched_debug_lock, flags);
+ SEQ_printf(m, "\n");
}
static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
"linear"
};
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
{
u64 ktime, sched_clk, cpu_clk;
unsigned long flags;
- int cpu;
local_irq_save(flags);
ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
#undef PN
#undef P
- SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+ SEQ_printf(m, " .%-40s: %d (%s)\n",
+ "sysctl_sched_tunable_scaling",
sysctl_sched_tunable_scaling,
sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+ SEQ_printf(m, "\n");
+}
- for_each_online_cpu(cpu)
- print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+ int cpu = (unsigned long)(v - 2);
- SEQ_printf(m, "\n");
+ if (cpu != -1)
+ print_cpu(m, cpu);
+ else
+ sched_debug_header(m);
return 0;
}
void sysrq_sched_debug_show(void)
{
- sched_debug_show(NULL, NULL);
+ int cpu;
+
+ sched_debug_header(NULL);
+ for_each_online_cpu(cpu)
+ print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This iterator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+ unsigned long n = *offset;
+
+ if (n == 0)
+ return (void *) 1;
+
+ n--;
+
+ if (n > 0)
+ n = cpumask_next(n - 1, cpu_online_mask);
+ else
+ n = cpumask_first(cpu_online_mask);
+
+ *offset = n + 1;
+
+ if (n < nr_cpu_ids)
+ return (void *)(unsigned long)(n + 2);
+ return NULL;
+}
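
A userspace simulation of the offset mapping described in the comment, using a made-up online mask; position 1 is the header and position n + 2 encodes cpu n:

    #include <stdio.h>

    #define NR_CPUS 8
    static const int online[NR_CPUS] = { 1, 0, 1, 1, 0, 0, 1, 0 };

    static long next_online(long cpu)     /* first online cpu >= cpu */
    {
        while (cpu < NR_CPUS && !online[cpu])
            cpu++;
        return cpu;
    }

    static long fake_start(long *offset)  /* mirrors sched_debug_start() */
    {
        long n = *offset;

        if (n == 0)
            return 1;                     /* header position */
        n = next_online(n - 1);
        *offset = n + 1;
        return n < NR_CPUS ? n + 2 : 0;   /* encoded cpu, or end */
    }

    int main(void)
    {
        long off = 0, v;

        while ((v = fake_start(&off)) != 0) {
            if (v == 1)
                printf("header\n");
            else
                printf("cpu %ld\n", v - 2);
            off++;                        /* what sched_debug_next() does */
        }
        return 0;   /* prints: header, cpu 0, cpu 2, cpu 3, cpu 6 */
    }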
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+ .start = sched_debug_start,
+ .next = sched_debug_next,
+ .stop = sched_debug_stop,
+ .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+ seq_release(inode, file);
+
+ return 0;
}
static int sched_debug_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, sched_debug_show, NULL);
+ int ret = 0;
+
+ ret = seq_open(filp, &sched_debug_sops);
+
+ return ret;
}
static const struct file_operations sched_debug_fops = {
.open = sched_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = sched_debug_release,
};
static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81fa53643409..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
+ s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
- min_vruntime = vruntime;
+ max_vruntime = vruntime;
- return min_vruntime;
+ return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
vruntime = min_vruntime(vruntime, se->vruntime);
}
+ /* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
/*
- * We calculate the vruntime slice of a to be inserted task
+ * We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. If the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -1680,9 +1702,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}
/* ensure we never gain time by being placed backwards. */
- vruntime = max_vruntime(se->vruntime, vruntime);
-
- se->vruntime = vruntime;
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -3254,25 +3274,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
static int select_idle_sibling(struct task_struct *p, int target)
{
- int cpu = smp_processor_id();
- int prev_cpu = task_cpu(p);
struct sched_domain *sd;
struct sched_group *sg;
- int i;
+ int i = task_cpu(p);
- /*
- * If the task is going to be woken-up on this cpu and if it is
- * already idle, then it is the right target.
- */
- if (target == cpu && idle_cpu(cpu))
- return cpu;
+ if (idle_cpu(target))
+ return target;
/*
- * If the task is going to be woken-up on the cpu where it previously
- * ran and if it is currently idle, then it the right target.
+ * If the previous cpu is cache-affine and idle, don't be stupid.
*/
- if (target == prev_cpu && idle_cpu(prev_cpu))
- return prev_cpu;
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
/*
* Otherwise, iterate the domains and find an eligible idle cpu.
@@ -3286,7 +3299,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
goto next;
for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
+ if (i == target || !idle_cpu(i))
goto next;
}
@@ -3883,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) running (obviously), or
+ * 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
*/
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
- int new_dst_cpu;
+ int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
@@ -3903,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
- new_dst_cpu = cpumask_first_and(env->dst_grpmask,
- tsk_cpus_allowed(p));
- if (new_dst_cpu < nr_cpu_ids) {
- env->flags |= LBF_SOME_PINNED;
- env->new_dst_cpu = new_dst_cpu;
+ /* Prevent dst_cpu from being re-selected via env's cpus */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ env->flags |= LBF_SOME_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
}
+
return 0;
}
@@ -3929,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
+
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
-#endif
+
return 1;
}
- if (tsk_cache_hot) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- return 0;
- }
- return 1;
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ return 0;
}
/*
@@ -3957,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
struct task_struct *p, *n;
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
- if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
- continue;
-
if (!can_migrate_task(p, env))
continue;
@@ -4011,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
break;
}
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (!can_migrate_task(p, env))
goto next;
load = task_h_load(p);
@@ -4022,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
- if (!can_migrate_task(p, env))
- goto next;
-
move_task(p, env);
pulled++;
env->imbalance -= load;
@@ -4254,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_POWER_SCALE;
}
@@ -4264,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
return default_scale_freq_power(sd, cpu);
}
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -4279,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
return default_scale_smt_power(sd, cpu);
}
-unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
@@ -4969,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static int need_active_balance(struct lb_env *env)
{
@@ -5000,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *balance)
{
int ld_moved, cur_ld_moved, active_balance = 0;
- int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+ struct cpumask *cpus = __get_cpu_var(load_balance_mask);
struct lb_env env = {
.sd = sd,
@@ -5016,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.cpus = cpus,
};
+ /*
+ * For NEWLY_IDLE load_balancing, we don't need to consider
+ * other cpus in our group
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ env.dst_grpmask = NULL;
+
cpumask_copy(cpus, cpu_active_mask);
- max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
@@ -5043,7 +5059,6 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
- lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -5070,17 +5085,17 @@ more_balance:
double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
- if (env.flags & LBF_NEED_BREAK) {
- env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
- }
-
/*
* some other cpu did the load balance for us.
*/
if (cur_ld_moved && env.dst_cpu != smp_processor_id())
resched_cpu(env.dst_cpu);
+ if (env.flags & LBF_NEED_BREAK) {
+ env.flags &= ~LBF_NEED_BREAK;
+ goto more_balance;
+ }
+
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
@@ -5100,14 +5115,17 @@ more_balance:
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
- lb_iterations++ < max_lb_iterations) {
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
+
+ /* Prevent dst_cpu from being re-selected via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
@@ -5228,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
- update_rq_runnable_avg(this_rq, 1);
-
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
@@ -5339,7 +5355,7 @@ out_unlock:
return 0;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notices that there may be an idle rebalancing
@@ -5404,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ for (; sd; sd = sd->parent)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -5419,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ for (; sd; sd = sd->parent)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -5477,7 +5499,7 @@ void update_max_interval(void)
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
- * Balancing parameters are set up in arch_init_sched_domains.
+ * Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
@@ -5515,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
- * We've pulled tasks over so either we're no
- * longer idle.
+ * The LBF_SOME_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
*/
- idle = CPU_NOT_IDLE;
+ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
}
@@ -5549,9 +5572,9 @@ out:
rq->next_balance = next_balance;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for which scheduler ticks are stopped.
*/
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5694,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
@@ -6101,7 +6124,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
@@ -6164,7 +6187,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
* Decrement CPU power based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_POWER, true)
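
For context, a minimal sketch of the spin-on-owner heuristic the removed comment described, using hand-rolled stand-ins rather than the kernel's mutex internals: spin while the lock is held and its owner is on a CPU; give up and sleep once the owner blocks.

#include <stdatomic.h>
#include <stdbool.h>

struct mtx {
	_Atomic int locked;
	_Atomic bool owner_on_cpu;	/* stand-in for owner->on_cpu */
};

/* Return true if the lock was released while we spun, false if the
 * owner got scheduled out and the caller should sleep instead. */
static bool spin_on_owner(struct mtx *m)
{
	while (atomic_load(&m->locked)) {
		if (!atomic_load(&m->owner_on_cpu))
			return false;
	}
	return true;
}

int main(void)
{
	struct mtx m = { 1, false };	/* held, owner off-CPU */

	/* With the owner not running, spinning gives up right away. */
	return spin_on_owner(&m) ? 1 : 0;
}
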
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..d8da01008d39 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks are never migrated */
}
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+ idle_enter_fair(rq);
+}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
@@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+ /* Trigger the post_schedule callback to do an idle_enter for CFS */
+ rq->post_schedule = 1;
+#endif
return rq->idle;
}
@@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .pre_schedule = pre_schedule_idle,
+ .post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
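
A toy model of the handshake these hooks depend on (stand-in structures, not the kernel's): pick_next_task_idle() raises rq->post_schedule, and the scheduler core then invokes the class's post_schedule callback exactly once after the switch completes.

#include <stdio.h>

struct rq {
	int post_schedule;
	void (*post_schedule_fn)(struct rq *rq);
};

static void post_schedule_idle(struct rq *rq)
{
	puts("idle_enter_fair(rq)");	/* account this cpu as idle */
}

static void pick_next_task_idle(struct rq *rq)
{
	rq->post_schedule = 1;		/* ask the core for a callback */
}

static void finish_task_switch(struct rq *rq)
{
	if (rq->post_schedule) {
		rq->post_schedule = 0;
		rq->post_schedule_fn(rq);
	}
}

int main(void)
{
	struct rq rq = { 0, post_schedule_idle };

	pick_next_task_idle(&rq);
	finish_task_switch(&rq);	/* runs the callback once */
	finish_task_switch(&rq);	/* flag already cleared: no-op */
	return 0;
}
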
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f02b2847357..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
#include <linux/slab.h>
+int sched_rr_timeslice = RR_TIMESLICE;
+
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
return;
delta_exec = rq->clock_task - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
- (p->nr_cpus_allowed > 1))
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
return 1;
return 0;
}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (p->on_rq && !rq->rt.rt_nr_running)
- pull_rt_task(rq);
+ if (!p->on_rq || rq->rt.rt_nr_running)
+ return;
+
+ if (pull_rt_task(rq))
+ resched_task(rq->curr);
}
void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
if (soft != RLIM_INFINITY) {
unsigned long next;
- p->rt.timeout++;
+ if (p->rt.watchdog_stamp != jiffies) {
+ p->rt.timeout++;
+ p->rt.watchdog_stamp = jiffies;
+ }
+
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
if (p->rt.timeout > next)
p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
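
The watchdog_stamp test added above is a once-per-tick rate limiter: the timeout counter may only advance when the jiffy has changed. A self-contained sketch of the same pattern, with illustrative stand-in types:

#include <assert.h>

struct rt_info { unsigned long watchdog_stamp, timeout; };

static void watchdog_tick(struct rt_info *rt, unsigned long jiffies)
{
	if (rt->watchdog_stamp != jiffies) {
		rt->timeout++;
		rt->watchdog_stamp = jiffies;
	}
}

int main(void)
{
	struct rt_info rt = { 0, 0 };

	watchdog_tick(&rt, 100);
	watchdog_tick(&rt, 100);	/* same jiffy: not counted again */
	watchdog_tick(&rt, 101);
	assert(rt.timeout == 2);
	return 0;
}
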
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
if (--p->rt.time_slice)
return;
- p->rt.time_slice = RR_TIMESLICE;
+ p->rt.time_slice = sched_rr_timeslice;
/*
* Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
* Time slice is 0 for SCHED_FIFO tasks
*/
if (task->policy == SCHED_RR)
- return RR_TIMESLICE;
+ return sched_rr_timeslice;
else
return 0;
}
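
With the quantum now read from the variable sched_rr_timeslice (in ticks) instead of the RR_TIMESLICE constant, it can be retuned at run time. Below is a sketch of how a millisecond-based tunable might map onto it; HZ, the default value, and ms_to_ticks() are made-up stand-ins, not the kernel's helpers.

#include <stdio.h>

#define HZ 250
#define RR_TIMESLICE_DEFAULT (100 * HZ / 1000)	/* ~100ms in ticks */

static unsigned long sched_rr_timeslice = RR_TIMESLICE_DEFAULT;

static unsigned long ms_to_ticks(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;		/* round up */
}

int main(void)
{
	sched_rr_timeslice = ms_to_ticks(50);	/* e.g. tune to 50ms */
	printf("RR quantum: %lu ticks\n", sched_rr_timeslice);
	return 0;
}
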
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..ce39224d6155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,10 +1,14 @@
#include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/tick.h>
#include "cpupri.h"
+#include "cpuacct.h"
extern __read_mostly int scheduler_running;
@@ -31,6 +35,31 @@ extern __read_mostly int scheduler_running;
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (e.g. nice +19 on an autogroup) and of deeper
+ * taskgroup hierarchies, especially on larger systems. This is not a
+ * user-visible change and does not change the user interface for setting
+ * shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
+# define SCHED_LOAD_RESOLUTION 10
+# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION 0
+# define scale_load(w) (w)
+# define scale_load_down(w) (w)
+#endif
+
+#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
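
A quick self-contained check of the scaling macros' arithmetic when the extra resolution is enabled (constants duplicated here for illustration): weights carry ten extra low bits internally and shift back down exactly for user-visible values.

#include <assert.h>

#define RESOLUTION 10
#define scale_load(w)      ((w) << RESOLUTION)
#define scale_load_down(w) ((w) >> RESOLUTION)

int main(void)
{
	unsigned long nice_0 = 1024;		/* NICE_0_LOAD base weight */
	unsigned long internal = scale_load(nice_0);

	assert(internal == 1024UL * 1024UL);	/* 2^20 internal units */
	assert(scale_load_down(internal) == nice_0);	/* exact round-trip */
	return 0;
}
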
@@ -152,11 +181,6 @@ struct task_group {
#define MAX_SHARES (1UL << 18)
#endif
-/* Default task group.
- * Every task in system belong to this group at bootup.
- */
-extern struct task_group root_task_group;
-
typedef int (*tg_visitor)(struct task_group *, void *);
extern int walk_tg_tree_from(struct task_group *from,
@@ -194,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+#endif
+
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
@@ -370,10 +406,13 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ unsigned long last_sched_tick;
+#endif
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
@@ -545,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_id);
+struct sched_group_power {
+ atomic_t ref;
+ /*
+ * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+ * single CPU.
+ */
+ unsigned int power, power_orig;
+ unsigned long next_update;
+ /*
+ * Number of busy cpus in this group.
+ */
+ atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_power *sgp;
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * Mask selecting which cpus in the group are allowed to iterate up the
+ * domain tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgp->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_cpus(group));
+}
+
extern int group_balance_cpu(struct sched_group *sg);
#endif /* CONFIG_SMP */
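
The cpumask[0] tails above use the kernel's variable-length trailing-array idiom: one allocation holds the fixed header plus a mask sized for the booted CPU count. A stand-alone sketch using the standard C flexible array member (illustrative names only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

struct group {
	int group_weight;
	unsigned long cpumask[];	/* flexible array member */
};

int main(void)
{
	int longs = 2;			/* enough mask words for this box */
	struct group *g = malloc(sizeof(*g) + longs * sizeof(unsigned long));

	if (!g)
		return 1;
	memset(g->cpumask, 0, longs * sizeof(unsigned long));
	g->cpumask[0] = 0x5;		/* cpus 0 and 2 belong to the group */
	g->group_weight = 2;

	/* What group_first_cpu() computes: lowest set bit in the mask. */
	for (int cpu = 0; cpu < longs * BITS_PER_LONG; cpu++) {
		if (g->cpumask[cpu / BITS_PER_LONG] &
		    (1UL << (cpu % BITS_PER_LONG))) {
			printf("first cpu: %d\n", cpu);
			break;
		}
	}
	free(g);
	return 0;
}
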
@@ -782,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x04 /* internal use, task got migrated */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -854,14 +955,61 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
- CPUACCT_STAT_USER, /* ... user mode */
- CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+#define ENQUEUE_WAKEUP 1
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
- CPUACCT_STAT_NSTATS,
-};
+#define DEQUEUE_SLEEP 1
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+ struct task_struct * (*pick_next_task) (struct rq *rq);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+
+ void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ void (*post_schedule) (struct rq *this_rq