diff options
Diffstat (limited to 'kernel')
169 files changed, 16075 insertions, 8943 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f437..b3097bde4e9c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -4,3 +4,4 @@ config_data.h config_data.gz timeconst.h +hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6c072b6da239..271fd3119af9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o \ async.o range.o groups.o lglock.o smpboot.o @@ -24,10 +24,9 @@ endif obj-y += sched/ obj-y += power/ +obj-y += cpu/ -ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) -obj-$(CONFIG_X86) += kcmp.o -endif +obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -127,11 +126,19 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(obj)/time.o: $(obj)/timeconst.h -quiet_cmd_timeconst = TIMEC $@ - cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + targets += timeconst.h -$(obj)/timeconst.h: $(src)/timeconst.pl FORCE - $(call if_changed,timeconst) +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) ifeq ($(CONFIG_MODULE_SIG),y) # @@ -153,23 +160,7 @@ kernel/modsign_certificate.o: signing_key.x509 extra_certificates # fail and that the kernel may be used afterwards. # ############################################################################### -sign_key_with_hash := -ifeq ($(CONFIG_MODULE_SIG_SHA1),y) -sign_key_with_hash := -sha1 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA224),y) -sign_key_with_hash := -sha224 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA256),y) -sign_key_with_hash := -sha256 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA384),y) -sign_key_with_hash := -sha384 -endif -ifeq ($(CONFIG_MODULE_SIG_SHA512),y) -sign_key_with_hash := -sha512 -endif -ifeq ($(sign_key_with_hash),) +ifndef CONFIG_MODULE_SIG_HASH $(error Could not determine digest type to use from kernel config) endif @@ -182,10 +173,10 @@ signing_key.priv signing_key.x509: x509.genkey @echo "### needs to be run as root, and uses a hardware random" @echo "### number generator if one is available." @echo "###" - openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ - -x509 -config x509.genkey \ + openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ + -batch -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ - -keyout signing_key.priv + -keyout signing_key.priv 2>&1 @echo "###" @echo "### Key pair generated." @echo "###" diff --git a/kernel/acct.c b/kernel/acct.c index 051e071a06e7..8d6e145138bb 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname) if (IS_ERR(file)) return PTR_ERR(file); - if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { + if (!S_ISREG(file_inode(file)->i_mode)) { filp_close(file, NULL); return -EACCES; } @@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_swaps = encode_comp_t(0); /* + * Get freeze protection. If the fs is frozen, just skip the write + * as we could deadlock the system otherwise. + */ + if (!file_start_write_trylock(file)) + goto out; + /* * Kernel segment override to datasegment and write it * to the accounting file. */ @@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); + file_end_write(file); out: revert_creds(orig_cred); } @@ -566,6 +573,7 @@ out: void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; + cputime_t utime, stime; unsigned long vsize = 0; if (group_dead && current->mm) { @@ -593,8 +601,9 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime += current->utime; - pacct->ac_stime += current->stime; + task_cputime(current, &utime, &stime); + pacct->ac_utime += utime; + pacct->ac_stime += stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/async.c b/kernel/async.c index 6f34904a0b53..61f023ce0228 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -57,65 +57,48 @@ asynchronous and synchronous parts of the kernel. #include <linux/slab.h> #include <linux/workqueue.h> +#include "workqueue_internal.h" + static async_cookie_t next_cookie = 1; -#define MAX_WORK 32768 +#define MAX_WORK 32768 +#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ -static LIST_HEAD(async_pending); -static ASYNC_DOMAIN(async_running); -static LIST_HEAD(async_domains); +static LIST_HEAD(async_global_pending); /* pending from all registered doms */ +static ASYNC_DOMAIN(async_dfl_domain); static DEFINE_SPINLOCK(async_lock); -static DEFINE_MUTEX(async_register_mutex); struct async_entry { - struct list_head list; + struct list_head domain_list; + struct list_head global_list; struct work_struct work; async_cookie_t cookie; - async_func_ptr *func; + async_func_t func; void *data; - struct async_domain *running; + struct async_domain *domain; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; - -/* - * MUST be called with the lock held! - */ -static async_cookie_t __lowest_in_progress(struct async_domain *running) +static async_cookie_t lowest_in_progress(struct async_domain *domain) { - async_cookie_t first_running = next_cookie; /* infinity value */ - async_cookie_t first_pending = next_cookie; /* ditto */ - struct async_entry *entry; - - /* - * Both running and pending lists are sorted but not disjoint. - * Take the first cookies from both and return the min. - */ - if (!list_empty(&running->domain)) { - entry = list_first_entry(&running->domain, typeof(*entry), list); - first_running = entry->cookie; - } + struct list_head *pending; + async_cookie_t ret = ASYNC_COOKIE_MAX; + unsigned long flags; - list_for_each_entry(entry, &async_pending, list) { - if (entry->running == running) { - first_pending = entry->cookie; - break; - } - } + spin_lock_irqsave(&async_lock, flags); - return min(first_running, first_pending); -} + if (domain) + pending = &domain->pending; + else + pending = &async_global_pending; -static async_cookie_t lowest_in_progress(struct async_domain *running) -{ - unsigned long flags; - async_cookie_t ret; + if (!list_empty(pending)) + ret = list_first_entry(pending, struct async_entry, + domain_list)->cookie; - spin_lock_irqsave(&async_lock, flags); - ret = __lowest_in_progress(running); spin_unlock_irqrestore(&async_lock, flags); return ret; } @@ -127,20 +110,10 @@ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); - struct async_entry *pos; unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; - struct async_domain *running = entry->running; - /* 1) move self to the running queue, make sure it stays sorted */ - spin_lock_irqsave(&async_lock, flags); - list_for_each_entry_reverse(pos, &running->domain, list) - if (entry->cookie < pos->cookie) - break; - list_move_tail(&entry->list, &pos->list); - spin_unlock_irqrestore(&async_lock, flags); - - /* 2) run (and print duration) */ + /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "calling %lli_%pF @ %i\n", (long long)entry->cookie, @@ -157,23 +130,22 @@ static void async_run_entry_fn(struct work_struct *work) (long long)ktime_to_ns(delta) >> 10); } - /* 3) remove self from the running queue */ + /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); - list_del(&entry->list); - if (running->registered && --running->count == 0) - list_del_init(&running->node); + list_del_init(&entry->domain_list); + list_del_init(&entry->global_list); - /* 4) free the entry */ + /* 3) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 5) wake up any waiters */ + /* 4) wake up any waiters */ wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) +static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -193,19 +165,25 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ - ptr(data, newcookie); + func(data, newcookie); return newcookie; } + INIT_LIST_HEAD(&entry->domain_list); + INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = ptr; + entry->func = func; entry->data = data; - entry->running = running; + entry->domain = domain; spin_lock_irqsave(&async_lock, flags); + + /* allocate cookie and queue */ newcookie = entry->cookie = next_cookie++; - list_add_tail(&entry->list, &async_pending); - if (running->registered && running->count++ == 0) - list_add_tail(&running->node, &async_domains); + + list_add_tail(&entry->domain_list, &domain->pending); + if (domain->registered) + list_add_tail(&entry->global_list, &async_global_pending); + atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); @@ -220,34 +198,34 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a /** * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +async_cookie_t async_schedule(async_func_t func, void *data) { - return __async_schedule(ptr, data, &async_running); + return __async_schedule(func, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function - * @running: running list for the domain + * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_domain() functions - * to wait within a certain synchronization domain rather than globally. - * A synchronization domain is specified via the running queue @running to use. - * Note: This function may be called from atomic or non-atomic contexts. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. A + * synchronization domain is specified via @domain. Note: This function + * may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct async_domain *running) +async_cookie_t async_schedule_domain(async_func_t func, void *data, + struct async_domain *domain) { - return __async_schedule(ptr, data, running); + return __async_schedule(func, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); @@ -258,18 +236,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { - mutex_lock(&async_register_mutex); - do { - struct async_domain *domain = NULL; - - spin_lock_irq(&async_lock); - if (!list_empty(&async_domains)) - domain = list_first_entry(&async_domains, typeof(*domain), node); - spin_unlock_irq(&async_lock); - - async_synchronize_cookie_domain(next_cookie, domain); - } while (!list_empty(&async_domains)); - mutex_unlock(&async_register_mutex); + async_synchronize_full_domain(NULL); } EXPORT_SYMBOL_GPL(async_synchronize_full); @@ -284,51 +251,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full); */ void async_unregister_domain(struct async_domain *domain) { - mutex_lock(&async_register_mutex); spin_lock_irq(&async_lock); - WARN_ON(!domain->registered || !list_empty(&domain->node) || - !list_empty(&domain->domain)); + WARN_ON(!domain->registered || !list_empty(&domain->pending)); domain->registered = 0; spin_unlock_irq(&async_lock); - mutex_unlock(&async_register_mutex); } EXPORT_SYMBOL_GPL(async_unregister_domain); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @domain: running list to synchronize on + * @domain: the domain to synchronize * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @domain have been done. + * synchronization domain specified by @domain have been done. */ void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, domain); + async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint - * @running: running list to synchronize on + * @domain: the domain to synchronize (%NULL for all registered domains) * * This function waits until all asynchronous function calls for the - * synchronization domain specified by running list @running submitted - * prior to @cookie have been done. + * synchronization domain specified by @domain submitted prior to @cookie + * have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { ktime_t uninitialized_var(starttime), delta, endtime; - if (!running) - return; - if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } - wait_event(async_done, lowest_in_progress(running) >= cookie); + wait_event(async_done, lowest_in_progress(domain) >= cookie); if (initcall_debug && system_state == SYSTEM_BOOTING) { endtime = ktime_get(); @@ -350,6 +311,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); */ void async_synchronize_cookie(async_cookie_t cookie) { - async_synchronize_cookie_domain(cookie, &async_running); + async_synchronize_cookie_domain(cookie, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); + +/** + * current_is_async - is %current an async worker task? + * + * Returns %true if %current is an async worker task. + */ +bool current_is_async(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker->current_func == async_run_entry_fn; +} diff --git a/kernel/audit.c b/kernel/audit.c index 5c7e62ff4795..21c7fa615bd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,7 @@ #ifdef CONFIG_SECURITY #include <linux/security.h> #endif -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> @@ -646,14 +646,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* As soon as there's any sign of userspace auditd, * start kauditd to talk to it */ - if (!kauditd_task) + if (!kauditd_task) { kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } } - seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -859,7 +859,7 @@ static void audit_receive_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; /* - * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 + * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; @@ -868,13 +868,13 @@ static void audit_receive_skb(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; - while (NLMSG_OK(nlh, len)) { + while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err); - nlh = NLMSG_NEXT(nlh, len); + nlh = nlmsg_next(nlh, &len); } } @@ -1667,7 +1667,7 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; if (audit_pid) { skb_queue_tail(&audit_skb_queue, ab->skb); diff --git a/kernel/audit.h b/kernel/audit.h index 45c8325de5bb..1c95131ef760 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -205,8 +205,6 @@ struct audit_context { #endif }; -#ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, @@ -218,7 +216,6 @@ extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); extern void audit_log_name(struct audit_context *context, struct audit_names *n, struct path *path, int record_num, int *call_panic); -#endif extern int audit_pid; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d6..a291aa23fb3f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void) } spin_unlock(&hash_lock); trim_marked(tree); - put_tree(tree); drop_collected_mounts(root_mnt); skip_it: + put_tree(tree); mutex_lock(&audit_filter_mutex); } list_del(&cursor); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bc6595fe952e..83a2970295d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -533,6 +533,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } diff --git a/kernel/capability.c b/kernel/capability.c index 493d97259484..f6c2ce5701e1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap) EXPORT_SYMBOL(ns_capable); /** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file: The file we want to check + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ + if (WARN_ON_ONCE(!cap_valid(cap))) + return false; + + if (security_capable(file->f_cred, ns, cap) == 0) + return true; + + return false; +} +EXPORT_SYMBOL(file_ns_capable); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4855892798fd..2a9926275f80 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/ctype.h> #include <linux/errno.h> -#include <linux/fs.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> @@ -52,14 +51,14 @@ #include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> -#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> #include <linux/atomic.h> @@ -83,7 +82,13 @@ * B happens only through cgroup_show_options() and using cgroup_root_mutex * breaks it. */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +#else static DEFINE_MUTEX(cgroup_mutex); +#endif + static DEFINE_MUTEX(cgroup_root_mutex); /* @@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_mask; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* All cgroups on this root, cgroup_mutex protected */ - struct list_head allcg_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -162,6 +117,9 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + + /* file xattrs */ + struct simple_xattrs xattrs; }; /* @@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ - return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ - return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); - static int css_unbias_refcnt(int refcnt) { return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; @@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) return test_bit(CGRP_REMOVED, &cgrp->flags); } -/* bits in struct cgroupfs_root flags field */ -enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ - ROOT_XATTR, /* supports extended attributes */ -}; +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) +{ + while (cgrp) { + if (cgrp == ancestor) + return true; + cgrp = cgrp->parent; + } + return false; +} +EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return __d_cfe(dentry)->type; } +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the mutex should be later unlocked. On + * failure returns false with no lock held. + */ +static bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -376,22 +353,18 @@ static int css_set_count; * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 -#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) -static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; +static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { int i; - int index; - unsigned long tmp = 0UL; + unsigned long key = 0UL; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) - tmp += (unsigned long)css[i]; - tmp = (tmp >> 16) ^ tmp; + key += (unsigned long)css[i]; + key = (key >> 16) ^ key; - index = hash_long(tmp, CSS_SET_HASH_BITS); - - return &css_set_table[index]; + return key; } /* We don't maintain the lists running through each css_set to its @@ -418,7 +391,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) } /* This css_set is dead. unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); + hash_del(&cg->hlist); css_set_count--; list_for_each_entry_safe(link, saved_link, &cg->cg_links, @@ -426,12 +399,20 @@ static void __put_css_set(struct css_set *cg, int taskexit) struct cgroup *cgrp = link->cgrp; list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); + + /* + * We may not be holding cgroup_mutex, and if cgrp->count is + * dropped to 0 the cgroup can be destroyed at any time, hence + * rcu_read_lock is used to keep it alive. + */ + rcu_read_lock(); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + rcu_read_unlock(); kfree(link); } @@ -550,9 +531,8 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct hlist_head *hhead; - struct hlist_node *node; struct css_set *cg; + unsigned long key; /* * Build the set of subsystem state objects that we want to see in the @@ -572,8 +552,8 @@ static struct css_set *find_existing_css_set( } } - hhead = css_set_hash(template); - hlist_for_each_entry(cg, node, hhead, hlist) { + key = css_set_hash(template); + hash_for_each_possible(css_set_table, cg, hlist, key) { if (!compare_css_sets(cg, oldcg, cgrp, template)) continue; @@ -657,8 +637,8 @@ static struct css_set *find_css_set( struct list_head tmp_cg_links; - struct hlist_head *hhead; struct cg_cgroup_link *link; + unsigned long key; /* First see if we already have a cgroup group that matches * the desired set */ @@ -704,8 +684,8 @@ static struct css_set *find_css_set( css_set_count++; /* Add this cgroup group to the hash table */ - hhead = css_set_hash(res->subsys); - hlist_add_head(&res->hlist, hhead); + key = css_set_hash(res->subsys); + hash_add(css_set_table, &res->hlist, key); write_unlock(&css_set_lock); @@ -797,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_lock); - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. - */ -void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unlock); - /* * A couple of forward declarations required, due to cyclic reference loop: * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> @@ -856,57 +815,84 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) { - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - struct cgroup_subsys *ss; - BUG_ON(!(cgroup_is_removed(cgrp))); - /* It's possible for external users to be holding css - * reference counts on a cgroup; css_put() needs to - * be able to access the cgroup after decrementing - * the reference count in order to know if it needs to - * queue the cgroup to be handled by the release - * agent */ - synchronize_rcu(); + struct cgroup_name *name; - mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_subsys(cgrp->root, ss) - ss->css_free(cgrp); + name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name->name, dentry->d_name.name); + return name; +} - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); +static void cgroup_free_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, free_work); + struct cgroup_subsys *ss; - /* - * Drop the active superblock reference that we took when we - * created the cgroup - */ - deactivate_super(cgrp->root->sb); + mutex_lock(&cgroup_mutex); + /* + * Release the subsystem state objects. + */ + for_each_subsys(cgrp->root, ss) + ss->css_free(cgrp); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); + cgrp->root->number_of_cgroups--; + mutex_unlock(&cgroup_mutex); + + /* + * We get a ref to the parent's dentry, and put the ref when + * this cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + dput(cgrp->parent->dentry); + + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + + /* + * Drop the active superblock reference that we took when we + * created the cgroup. This will free cgrp->root, if we are + * holding the last reference to @sb. + */ + deactivate_super(cgrp->root->sb); + + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + + simple_xattrs_free(&cgrp->xattrs); - simple_xattrs_free(&cgrp->xattrs); + kfree(rcu_dereference_raw(cgrp->name)); + kfree(cgrp); +} + +static void cgroup_free_rcu(struct rcu_head *head) +{ + struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); + + schedule_work(&cgrp->free_work); +} + +static void cgroup_diput(struct dentry *dentry, struct inode *inode) +{ + /* is dentry a directory ? if so, kfree() associated cgroup */ + if (S_ISDIR(inode->i_mode)) { + struct cgroup *cgrp = dentry->d_fsdata; - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - kfree_rcu(cgrp, rcu_head); + BUG_ON(!(cgroup_is_removed(cgrp))); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; - struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); + simple_xattrs_free(&cfe->xattrs); kfree(cfe); - simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -925,13 +911,17 @@ static void remove_dir(struct dentry *d) dput(parent); } -static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { struct cfent *cfe; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); + /* + * If we're doing cleanup due to failure of cgroup_create(), + * the corresponding @cfe may not exist. + */ list_for_each_entry(cfe, &cgrp->files, node) { struct dentry *d = cfe->dentry; @@ -944,9 +934,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) list_del_init(&cfe->node); dput(d); - return 0; + break; } - return -ENOENT; } /** @@ -1083,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; - synchronize_rcu(); return 0; } @@ -1096,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) + seq_puts(seq, ",sane_behavior"); + if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); - if (test_bit(ROOT_XATTR, &root->flags)) + if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); @@ -1160,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } + if (!strcmp(token, "__DEVEL__sane_behavior")) { + opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; + continue; + } if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); + opts->flags |= CGRP_ROOT_NOPREFIX; continue; } if (!strcmp(token, "clone_children")) { @@ -1169,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "xattr")) { - set_bit(ROOT_XATTR, &opts->flags); + opts->flags |= CGRP_ROOT_XATTR; continue; } if (!strncmp(token, "release_agent=", 14)) { @@ -1247,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ + if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + + if (opts->flags & CGRP_ROOT_NOPREFIX) { + pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + return -EINVAL; + } + + if (opts->cpuset_clone_children) { + pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + return -EINVAL; + } + } + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_mask & mask)) + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; @@ -1324,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: remount is not allowed\n"); + return -EINVAL; + } + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1393,6 +1405,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); + INIT_WORK(&cgrp->free_work, cgroup_free_fn); mutex_init(&cgrp->pidlist_mutex); INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); @@ -1408,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; - cgrp->top_cgroup = cgrp; + cgrp->name = &root_cgroup_name; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); } @@ -1597,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroupfs_root *existing_root; const struct cred *cred; int i; + struct css_set *cg; BUG_ON(sb->s_root != NULL); @@ -1650,14 +1664,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct hlist_head *hhead = &css_set_table[i]; - struct hlist_node *node; - struct css_set *cg; - - hlist_for_each_entry(cg, node, hhead, hlist) - link_css_set(&tmp_cg_links, cg, root_cgrp); - } + hash_for_each(css_set_table, i, cg, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -1677,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + + if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && + root->flags != opts.flags) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } + /* no subsys rebinding, so refcounts don't change */ drop_parsed_module_refcounts(opts.subsys_mask); } @@ -1761,49 +1777,48 @@ static struct kobject *cgroup_kobj; * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf. Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { - struct dentry *dentry = cgrp->dentry; + int ret = -ENAMETOOLONG; char *start; - rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), - "cgroup_path() called without proper locking"); - - if (!dentry || cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); + if (!cgrp->parent) { + if (strlcpy(buf, "/", buflen) >= buflen) + return -ENAMETOOLONG; return 0; } start = buf + buflen - 1; - *start = '\0'; - for (;;) { - int len = dentry->d_name.len; + rcu_read_lock(); + do { + const char *name = cgroup_name(cgrp); + int len; + + len = strlen(name); if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; + goto out; + memcpy(start, name, len); - dentry = cgrp->dentry; - if (!cgrp->parent) - continue; if (--start < buf) - return -ENAMETOOLONG; + goto out; *start = '/'; - } + + cgrp = cgrp->parent; + } while (cgrp->parent); + ret = 0; memmove(buf, start, buf + buflen - start); - return 0; +out: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(cgroup_path); @@ -1892,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +static void cgroup_task_migrate(struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; @@ -1925,122 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, } /** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - struct css_set *newcg; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. (See below) - */ - failed_ss = ss; - goto out; - } - } - } - - newcg = find_css_set(tsk->cgroups, cgrp); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(cgrp, &tset); - } - - synchronize_rcu(); -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. - */ - break; - if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); - } - } - return retval; -} - -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroupfs_root *root; - int retval = 0; - - cgroup_lock(); - for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); - - retval = cgroup_attach_task(from_cg, tsk); - if (retval) - break; - } - cgroup_unlock(); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - -/** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ - struct task_struct *tsk; + struct task_struct *leader = tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -2052,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * group - group_rwsem prevents new threads from appearing, and if * threads exit, this will just be an over-estimate. */ - group_size = get_nr_threads(leader); + if (threadgroup) + group_size = get_nr_threads(tsk); + else + group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); if (retval) goto out_free_group_list; - tsk = leader; i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2091,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; + + if (!threadgroup) + break; } while_each_thread(leader, tsk); rcu_read_unlock(); /* remember the number of threads in the array for later. */ @@ -2136,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2151,7 +2071,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* * step 5: success! and cleanup */ - synchronize_rcu(); retval = 0; out_put_css_set_refs: if (retval) { @@ -2218,11 +2137,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; @@ -2245,17 +2164,42 @@ retry_find_task: put_task_struct(tsk); goto retry_find_task; } - ret = cgroup_attach_proc(cgrp, tsk); - } else - ret = cgroup_attach_task(cgrp, tsk); + } + + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + threadgroup_unlock(tsk); put_task_struct(tsk); out_unlock_cgroup: - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return ret; } +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroupfs_root *root; + int retval = 0; + + mutex_lock(&cgroup_mutex); + for_each_active_root(root) { + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk, false); + if (retval) + break; + } + mutex_unlock(&cgroup_mutex); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { return attach_task_by_pid(cgrp, pid, false); @@ -2266,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) return attach_task_by_pid(cgrp, tgid, true); } -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. - */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); - static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { @@ -2295,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } @@ -2306,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } @@ -2531,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int ret; + struct cgroup_name *name, *old_name; + struct cgroup *cgrp; + + /* + * It's convinient to use parent dir's i_mutex to protected + * cgrp->name. + */ + lockdep_assert_held(&old_dir->i_mutex); + if (!S_ISDIR(old_dentry->d_inode->i_mode)) return -ENOTDIR; if (new_dentry->d_inode) return -EEXIST; if (old_dir != new_dir) return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + + cgrp = __d_cgrp(old_dentry); + + name = cgroup_alloc_name(new_dentry); + if (!name) + return -ENOMEM; + + ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + kfree(name); + return ret; + } + + old_name = cgrp->name; + rcu_assign_pointer(cgrp->name, name); + + kfree_rcu(old_name, rcu_head); + return 0; } static struct simple_xattrs *__d_xattrs(struct dentry *dentry) @@ -2545,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry) if (S_ISDIR(dentry->d_inode->i_mode)) return &__d_cgrp(dentry)->xattrs; else - return &__d_cft(dentry)->xattrs; + return &__d_cfe(dentry)->xattrs; } static inline int xattr_enabled(struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return test_bit(ROOT_XATTR, &root->flags); + return root->flags & CGRP_ROOT_XATTR; } static bool is_valid_xattr(const char *name) @@ -2637,7 +2597,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un */ static inline struct cftype *__file_cft(struct file *file) { - if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + if (file_inode(file)->i_fop != &cgroup_file_operations) return ERR_PTR(-EINVAL); return __d_cft(file->f_dentry); } @@ -2721,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - simple_xattrs_init(&cft->xattrs); - - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { + if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, subsys->name); strcat(name, "."); } @@ -2747,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, cfe->type = (void *)cft; cfe->dentry = dentry; dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } @@ -2764,19 +2723,21 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) continue; - if (is_add) + if (is_add) { err = cgroup_add_file(cgrp, subsys, cft); - else - err = cgroup_rm_file(cgrp, cft); - if (err) { - pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", - is_add ? "add" : "remove", cft->name, err); + if (err) + pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", + cft->name, err); ret = err; + } else { + cgroup_rm_file(cgrp, cft); } } return ret; @@ -3017,6 +2978,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, } EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +/** + * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup + * @pos: cgroup of interest + * + * Return the rightmost descendant of @pos. If there's no descendant, + * @pos is returned. This can be used during pre-order traversal to skip + * subtree of @pos. + */ +struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +{ + struct cgroup *last, *tmp; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + last = pos; + /* ->prev isn't RCU safe, walk ->next till the end */ + pos = NULL; + list_for_each_entry_rcu(tmp, &last->children, sibling) + pos = tmp; + } while (pos); + + return last; +} +EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); + static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) { struct cgroup *last; @@ -3268,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } +static void cgroup_transfer_one_task(struct task_struct *task, + struct cgroup_scanner *scan) +{ + struct cgroup *new_cgroup = scan->data; + + mutex_lock(&cgroup_mutex); + cgroup_attach_task(new_cgroup, task, false); + mutex_unlock(&cgroup_mutex); +} + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + struct cgroup_scanner scan; + + scan.cg = from; + scan.test_task = NULL; /* select all tasks in cgroup */ + scan.process_task = cgroup_transfer_one_task; + scan.heap = NULL; + scan.data = to; + + return cgroup_scan_tasks(&scan); +} + /* * Stuff for reading the 'tasks'/'procs' files. * @@ -3330,35 +3345,14 @@ static void pidlist_free(void *p) else kfree(p); } -static void *pidlist_resize(void *p, int newcount) -{ - void *newlist; - /* note: if new alloc fails, old p will still be valid either way */ - if (is_vmalloc_addr(p)) { - newlist = vmalloc(newcount * sizeof(pid_t)); - if (!newlist) - return NULL; - memcpy(newlist, p, newcount * sizeof(pid_t)); - vfree(p); - } else { - newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); - } - return newlist; -} /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * If the new stripped list is sufficiently smaller and there's enough memory - * to allocate a new buffer, will let go of the unneeded memory. Returns the - * number of unique elements. + * Returns the number of unique elements. */ -/* is the size difference enough that we should re-allocate the array? */ -#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) -static int pidlist_uniq(pid_t **p, int length) +static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; - pid_t *list = *p; - pid_t *newlist; /* * we presume the 0th element is unique, so i starts at 1. trivial @@ -3379,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length) dest++; } after: - /* - * if the length difference is large enough, we want to allocate a - * smaller buffer to save memory. if this fails due to out of memory, - * we'll just stay with what we've got. - */ - if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { - newlist = pidlist_resize(list, dest); - if (newlist) - *p = newlist; - } return dest; } @@ -3484,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(&array, length); + length = pidlist_uniq(array, length); l = cgroup_pidlist_find(cgrp, type); if (!l) { pidlist_free(array); @@ -3752,8 +3736,13 @@ static void cgroup_event_remove(struct work_struct *work) remove); struct cgroup *cgrp = event->cgrp; + remove_wait_queue(event->wqh, &event->wait); + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + eventfd_ctx_put(event->eventfd); kfree(event); dput(cgrp->dentry); @@ -3773,15 +3762,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { - __remove_wait_queue(event->wqh, &event->wait); - spin_lock(&cgrp->event_list_lock); - list_del_init(&event->list); - spin_unlock(&cgrp->event_list_lock); /* - * We are in atomic context, but cgroup_event_remove() may - * sleep, so we have to call it in workqueue. + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. */ - schedule_work(&event->remove); + spin_lock(&cgrp->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); } return 0; @@ -3807,6 +3806,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { struct cgroup_event *event = NULL; + struct cgroup *cgrp_cfile; unsigned int efd, cfd; struct file *efile = NULL; struct file *cfile = NULL; @@ -3852,7 +3852,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); + ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) goto fail; @@ -3862,6 +3862,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, goto fail; } + /* + * The file to be monitored must be in the same cgroup as + * cgroup.event_control is. + */ + cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); + if (cgrp_cfile != cgrp) { + ret = -EINVAL; + goto fail; + } + if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto fail; @@ -3872,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, if (ret) goto fail; - if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { - event->cft->unregister_event(cgrp, event->cft, event->eventfd); - ret = 0; - goto fail; - } + efile->f_op->poll(efile, &event->pt); /* * Events should be removed after rmdir of cgroup directory, but before @@ -3958,10 +3964,16 @@ static struct cftype files[] = { }, { .name = "cgroup.clone_children", + .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cgroup_sane_behavior_show, + }, + { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .read_seq_string = cgroup_release_agent_show, @@ -4073,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!(css->flags & CSS_ONLINE)) return; - /* - * css_offline() should be called with cgroup_mutex unlocked. See - * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for - * details. This temporary unlocking should go away once - * cgroup_mutex is unexported from controllers. - */ - if (ss->css_offline) { - mutex_unlock(&cgroup_mutex); + if (ss->css_offline) ss->css_offline(cgrp); - mutex_lock(&cgroup_mutex); - } cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; } @@ -4100,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { struct cgroup *cgrp; + struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int err = 0; struct cgroup_subsys *ss; @@ -4110,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + name = cgroup_alloc_name(dentry); + if (!name) + goto err_free_cgrp; + rcu_assign_pointer(cgrp->name, name); + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); if (cgrp->id < 0) - goto err_free_cgrp; + goto err_free_name; /* * Only live parents can have children. Note that the liveliness @@ -4135,9 +4144,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, init_cgroup_housekeeping(cgrp); + dentry->d_fsdata = cgrp; + cgrp->dentry = dentry; + cgrp->parent = parent; cgrp->root = parent->root; - cgrp->top_cgroup = parent->top_cgroup; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4172,8 +4183,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, lockdep_assert_held(&dentry->d_inode->i_mutex); /* allocation complete, commit to creation */ - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; @@ -4182,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) dget(dentry); + /* hold a ref to the parent's dentry */ + dget(parent->dentry); + /* creation succeeded, notify subsystems */ for_each_subsys(root, ss) { err = online_css(ss, cgrp); @@ -4217,6 +4229,8 @@ err_free_all: deactivate_super(sb); err_free_id: ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_name: + kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: kfree(cgrp); return err; @@ -4236,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -/* - * Check the reference count on each subsystem. Since we already - * established that there are no tasks in the cgroup, if the css refcount - * is also 1, then there should be no outstanding references, so the - * subsystem is safe to destroy. We scan across all subsystems rather than - * using the per-hierarchy linked list of mounted subsystems since we can - * be called via check_for_release() with no synchronization other than - * RCU, and the subsystem linked list isn't RCU-safe. - */ -static int cgroup_has_css_refs(struct cgroup *cgrp) -{ - int i; - - /* - * We won't need to lock the subsys array, because the subsystems - * we're concerned about aren't going anywhere since our cgroup root - * has a reference on them. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - struct cgroup_subsys_state *css; - - /* Skip subsystems not present or not in this hierarchy */ - if (ss == NULL || ss->root != cgrp->root) - continue; - - css = cgrp->subsys[ss->subsys_id]; - /* - * When called from check_for_release() it's possible - * that by this point the cgroup has been removed - * and the css deleted. But a false-positive doesn't - * matter, since it can only happen if the cgroup - * has been deleted and hence no longer needs the - * release agent to be called anyway. - */ - if (css && css_refcnt(css) > 1) - return 1; - } - return 0; -} - static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; struct cgroup *parent = cgrp->parent; - DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - LIST_HEAD(tmp_list); lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4340,20 +4311,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. Use - * a temporary list to avoid a deadlock with cgroup_event_wake(). Since - * cgroup_event_wake() is called with the wait queue head locked, - * remove_wait_queue() cannot be called while holding event_list_lock. + * directory to avoid race between userspace and kernelspace. */ spin_lock(&cgrp->event_list_lock); - list_splice_init(&cgrp->event_list, &tmp_list); - spin_unlock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &tmp_list, list) { + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { list_del_init(&event->list); - remove_wait_queue(event->wqh, &event->wait); - eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } + spin_unlock(&cgrp->event_list_lock); return 0; } @@ -4415,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - ss->active = 1; BUG_ON(online_css(ss, dummytop)); mutex_unlock(&cgroup_mutex); @@ -4438,6 +4402,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; int i, ret; + struct hlist_node *tmp; + struct css_set *cg; + unsigned long key; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || @@ -4503,27 +4470,20 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * this is all done under the css_set_lock. */ write_lock(&css_set_lock); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { - struct css_set *cg; - struct hlist_node *node, *tmp; - struct hlist_head *bucket = &css_set_table[i], *new_bucket; - - hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { - /* skip entries that we already rehashed */ - if (cg->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hlist_del(&cg->hlist); - /* set new value */ - cg->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - new_bucket = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, new_bucket); - } + hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hash_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + key = css_set_hash(cg->subsys); + hash_add(css_set_table, &cg->hlist, key); } write_unlock(&css_set_lock); - ss->active = 1; ret = online_css(ss, dummytop); if (ret) goto err_unload; @@ -4551,7 +4511,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cg_cgroup_link *link; - struct hlist_head *hhead; BUG_ON(ss->module == NULL); @@ -4565,12 +4524,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); offline_css(ss, dummytop); - ss->active = 0; - if (ss->use_id) { - idr_remove_all(&ss->idr); + if (ss->use_id) idr_destroy(&ss->idr); - } /* deassign the subsys_id */ subsys[ss->subsys_id] = NULL; @@ -4585,11 +4541,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_lock(&css_set_lock); list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { struct css_set *cg = link->cg; + unsigned long key; - hlist_del(&cg->hlist); + hash_del(&cg->hlist); cg->subsys[ss->subsys_id] = NULL; - hhead = css_set_hash(cg->subsys); - hlist_add_head(&cg->hlist, hhead); + key = css_set_hash(cg->subsys); + hash_add(css_set_table, &cg->hlist, key); } write_unlock(&css_set_lock); @@ -4631,9 +4588,6 @@ int __init cgroup_init_early(void) list_add(&init_css_set_link.cg_link_list, &init_css_set.cg_links); - for (i = 0; i < CSS_SET_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -4667,7 +4621,7 @@ int __init cgroup_init(void) { int err; int i; - struct hlist_head *hhead; + unsigned long key; err = bdi_init(&cgroup_backing_dev_info); if (err) @@ -4686,8 +4640,8 @@ int __init cgroup_init(void) } /* Add init_css_set to the hash table */ - hhead = css_set_hash(init_css_set.subsys); - hlist_add_head(&init_css_set.hlist, hhead); + key = css_set_hash(init_css_set.subsys); + hash_add(css_set_table, &init_css_set.hlist, key); BUG_ON(!init_root_id(&rootnode)); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); @@ -4724,7 +4678,7 @@ out: */ /* TODO: Use a proper seq_file iterator */ -static int proc_cgroup_show(struct seq_file *m, void *v) +int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; @@ -4776,19 +4730,6 @@ out: return retval; } -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { @@ -4890,17 +4831,17 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, and the builtin section of the subsys + * array is immutable, so we don't need to lock the + * subsys array here. On the other hand, modular section + * of the array can be freed at module unload, so we + * can't touch that. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* - * fork/exit callbacks are supported only for - * builtin subsystems and we don't need further - * synchronization as they never go away. - */ - if (!ss || ss->module) - continue; - if (ss->fork) ss->fork(child); } @@ -4965,13 +4906,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, see cgroup_post_fork() for details. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* modular subsystems can't use callbacks */ - if (!ss || ss->module) - continue; - if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -4982,48 +4923,22 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - if (cg) - put_css_set_taskexit(cg); -} - -/** - * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp - * @cgrp: the cgroup in question - * @task: the task in question - * - * See if @cgrp is a descendant of @task's cgroup in the appropriate - * hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. - */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) -{ - int ret; - struct cgroup *target; - - if (cgrp == dummytop) - return 1; - - target = task_cgroup_from_root(task, cgrp->root); - while (cgrp != target && cgrp!= cgrp->top_cgroup) - cgrp = cgrp->parent; - ret = (cgrp == target); - return ret; + put_css_set_taskexit(cg); } static void check_for_release(struct cgroup *cgrp) { /* All of these checks rely on RCU to keep the cgroup * structure alive */ - if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) - && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { - /* Control Group is currently removeable. If it's not + if (cgroup_is_releasable(cgrp) && + !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { + /* + * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue - * it now */ + * it now + */ int need_schedule_work = 0; + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { @@ -5056,24 +4971,11 @@ EXPORT_SYMBOL_GPL(__css_tryget); /* Caller must verify that the css is not for root cgroup */ void __css_put(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = css->cgroup; int v; - rcu_read_lock(); v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - - switch (v) { - case 1: - if (notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - break; - case 0: + if (v == 0) schedule_work(&css->dput_work); - break; - } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(__css_put); @@ -5274,7 +5176,7 @@ EXPORT_SYMBOL_GPL(free_css_id); static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) { struct css_id *newid; - int myid, error, size; + int ret, size; BUG_ON(!ss->use_id); @@ -5282,35 +5184,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) newid = kzalloc(size, GFP_KERNEL); if (!newid) return ERR_PTR(-ENOMEM); - /* get id */ - if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { - error = -ENOMEM; - goto err_out; - } + + idr_preload(GFP_KERNEL); spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ - error = idr_get_new_above(&ss->idr, newid, 1, &myid); + ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); spin_unlock(&ss->id_lock); + idr_preload_end(); /* Returns error when there are no free spaces for new ID.*/ - if (error) { - error = -ENOSPC; + if (ret < 0) goto err_out; - } - if (myid > CSS_ID_MAX) - goto remove_idr; - newid->id = myid; + newid->id = ret; newid->depth = depth; return newid; -remove_idr: - error = -ENOSPC; - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, myid); - spin_unlock(&ss->id_lock); err_out: kfree(newid); - return ERR_PTR(error); + return ERR_PTR(ret); } @@ -5383,55 +5274,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). - */ - tmp = idr_get_next(&ss->idr, &tmpid); - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - /* * get corresponding css from file open on cgroupfs directory */ @@ -5441,7 +5283,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) struct inode *inode; struct cgroup_subsys_state *css; - inode = f->f_dentry->d_inode; + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); diff --git a/kernel/compat.c b/kernel/compat.c index 36700e9e2be9..0a09e481b70b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } -asmlinkage long compat_sys_getitimer(int which, - struct compat_itimerval __user *it) +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) { struct itimerval kit; int error; @@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which, return error; } -asmlinkage long compat_sys_setitimer(int which, - struct compat_itimerval __user *in, - struct compat_itimerval __user *out) +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) { struct itimerval kin, kout; int error; @@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) memcpy(blocked->sig, &set, sizeof(set)); } -asmlinkage long compat_sys_sigprocmask(int how, - compat_old_sigset_t __user *nset, - compat_old_sigset_t __user *oset) +COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, + compat_old_sigset_t __user *, nset, + compat_old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; @@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) return 0; } -asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) -{ - struct rusage r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_getrusage(who, (struct rusage __user *) &r); - set_fs(old_fs); - - if (ret) - return ret; - - if (put_compat_rusage(&r, ru)) - return -EFAULT; - - return 0; -} - COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, @@ -593,7 +574,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, else ret = put_compat_rusage(&ru, uru); if (ret) - return ret; + return -EFAULT; } BUG_ON(info.si_code & __SI_MASK); @@ -971,7 +952,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, } void -sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) { switch (_NSIG_WORDS) { case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); @@ -982,10 +963,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); -asmlinkage long -compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, - struct compat_siginfo __user *uinfo, - struct compat_timespec __user *uts, compat_size_t sigsetsize) +void +sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +{ + switch (_NSIG_WORDS) { + case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; + case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; + case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; + case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + } +} + +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { compat_sigset_t s32; sigset_t s; @@ -1013,18 +1004,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } return ret; - -} - -asmlinkage long -compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, - struct compat_siginfo __user *uinfo) -{ - siginfo_t info; - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; - return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #ifdef __ARCH_WANT_COMPAT_SYS_TIME @@ -1067,23 +1046,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) -{ - sigset_t newset; - compat_sigset_t newset32; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&newset, &newset32); - return sigsuspend(&newset); -} -#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ - asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; @@ -1157,74 +1119,9 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, } #endif -struct compat_sysinfo { - s32 uptime; - u32 loads[3]; - u32 totalram; - u32 freeram; - u32 sharedram; - u32 bufferram; - u32 totalswap; - u32 freeswap; - u16 procs; - u16 pad; - u32 totalhigh; - u32 freehigh; - u32 mem_unit; - char _f[20-2*sizeof(u32)-sizeof(int)]; -}; - -asmlinkage long -compat_sys_sysinfo(struct compat_sysinfo __user *info) -{ - struct sysinfo s; - - do_sysinfo(&s); - - /* Check to see if any memory value is too large for 32-bit and scale - * down if needed - */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { - int bitcount = 0; - - while (s.mem_unit < PAGE_SIZE) { - s.mem_unit <<= 1; - bitcount++; - } - - s.totalram >>= bitcount; - s.freeram >>= bitcount; - s.sharedram >>= bitcount; - s.bufferram >>= bitcount; - s.totalswap >>= bitcount; - s.freeswap >>= bitcount; - s.totalhigh >>= bitcount; - s.freehigh >>= bitcount; - } - - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || - __put_user (s.uptime, &info->uptime) || - __put_user (s.loads[0], &info->loads[0]) || - __put_user (s.loads[1], &info->loads[1]) || - __put_user (s.loads[2], &info->loads[2]) || - __put_user (s.totalram, &info->totalram) || - __put_user (s.freeram, &info->freeram) || - __put_user (s.sharedram, &info->sharedram) || - __put_user (s.bufferram, &info->bufferram) || - __put_user (s.totalswap, &info->totalswap) || - __put_user (s.freeswap, &info->freeswap) || - __put_user (s.procs, &info->procs) || - __put_user (s.totalhigh, &info->totalhigh) || - __put_user (s.freehigh, &info->freehigh) || - __put_user (s.mem_unit, &info->mem_unit)) - return -EFAULT; - - return 0; -} - -#ifdef __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL -asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, - struct compat_timespec __user *interval) +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) { struct timespec t; int ret; @@ -1237,7 +1134,6 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, return -EFAULT; return ret; } -#endif /* __ARCH_WANT_COMPAT_SYS_SCHED_RR_GET_INTERVAL */ /* * Allocate user-space memory for the duration of a single system call, diff --git a/kernel/configs.c b/kernel/configs.c index 42e8fa075eed..c18b1f1ae515 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -79,7 +79,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - entry->size = kernel_config_data_size; + proc_set_size(entry, kernel_config_data_size); return 0; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e0e07fd55508..65349f07b878 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,29 +1,41 @@ +/* + * Context tracking: Probe on high level context boundaries such as kernel + * and userspace. This includes syscalls and exceptions entry/exit. + * + * This is used by RCU to remove its dependency on the timer tick while a CPU + * runs in userspace. + * + * Started by Frederic Weisbecker: + * + * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com> + * + * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, + * Steven Rostedt, Peter Zijlstra for suggestions and improvements. + * + */ + #include <linux/context_tracking.h> +#include <linux/kvm_host.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/percpu.h> #include <linux/hardirq.h> +#include <linux/export.h> -struct context_tracking { - /* - * When active is false, hooks are not set to - * minimize overhead: TIF flags are cleared - * and calls to user_enter/exit are ignored. This - * may be further optimized using static keys. - */ - bool active; - enum { - IN_KERNEL = 0, - IN_USER, - } state; -}; - -static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_FORCE .active = true, #endif }; +/** + * user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. + * + * This function must be called right before we switch from the kernel + * to userspace, when it's guaranteed the remaining kernel instructions + * to execute won't use any RCU read side critical section because this + * function sets RCU in extended quiescent state. + */ void user_enter(void) { unsigned long flags; @@ -39,40 +51,90 @@ void user_enter(void) if (in_interrupt()) return; + /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); local_irq_save(flags); if (__this_cpu_read(context_tracking.active) && __this_cpu_read(context_tracking.state) != IN_USER) { - __this_cpu_write(context_tracking.state, IN_USER); + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ + vtime_user_enter(current); rcu_user_enter(); + __this_cpu_write(context_tracking.state, IN_USER); } local_irq_restore(flags); } + +/** + * user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. + * + * This function must be called after we entered the kernel from userspace + * before any use of RCU read side critical section. This potentially include + * any high level kernel code like syscalls, exceptions, signal handling, etc... + * + * This call supports re-entrancy. This way it can be called from any exception + * handler without needing to know if we came from userspace or not. + */ void user_exit(void) { unsigned long flags; - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ if (in_interrupt()) return; local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { - __this_cpu_write(context_tracking.state, IN_KERNEL); + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). + */ rcu_user_exit(); + vtime_user_exit(current); + __this_cpu_write(context_tracking.state, IN_KERNEL); } local_irq_restore(flags); } +void guest_enter(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_enter(current); + else + __guest_enter(); +} +EXPORT_SYMBOL_GPL(guest_enter); + +void guest_exit(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_exit(current); + else + __guest_exit(); +} +EXPORT_SYMBOL_GPL(guest_exit); + + +/** + * context_tracking_task_switch - context switch the syscall callbacks + * @prev: the task that is being switched out + * @next: the task that is being switched in + * + * The context tracking uses the syscall slow path to implement its user-kernel + * boundaries probes on syscalls. This way it doesn't impact the syscall fast + * path on CPUs that don't do context tracking. + * + * But we need to clear the flag on the previous task because it may later + * migrate to some CPU that doesn't do the context tracking. As such the TIF + * flag may not be desired there. + */ void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 3046a503242c..b5e4ab2d427e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu) static inline void check_for_tasks(int cpu) { struct task_struct *p; + cputime_t utime, stime; write_lock_irq(&tasklist_lock); for_each_process(p) { + task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (p->utime || p->stime)) + (utime || stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, @@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Park the stopper thread */ + kthread_park(current); return 0; } diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile new file mode 100644 index 000000000000..59ab052ef7a0 --- /dev/null +++ b/kernel/cpu/Makefile @@ -0,0 +1 @@ +obj-y = idle.o diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c new file mode 100644 index 000000000000..8b86c0c68edf --- /dev/null +++ b/kernel/cpu/idle.c @@ -0,0 +1,116 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/tick.h> +#include <linux/mm.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + cpu_idle_poll(); + } else { + current_clr_polling(); + if (!need_resched()) { + stop_critical_timings(); + rcu_idle_enter(); + arch_cpu_idle(); + WARN_ON_ONCE(irqs_disabled()); + rcu_idle_exit(); + start_critical_timings(); + } else { + local_irq_enable(); + } + current_set_polling(); + } + arch_cpu_idle_exit(); + } + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7bb63eea6eb8..64b3f791bbe5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,14 +61,6 @@ #include <linux/cgroup.h> /* - * Workqueue for cpuset related tasks. - * - * Using kevent workqueue may cause deadlock when memory_migrate - * is set. So we create a separate workqueue thread for cpuset. - */ -static struct workqueue_struct *cpuset_wq; - -/* * Tracks how many cpusets are currently defined in system. * When there is only one cpuset (the root cpuset) we can * short circuit some hooks. @@ -95,18 +87,21 @@ struct cpuset { cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ - struct cpuset *parent; /* my parent */ - struct fmeter fmeter; /* memory_pressure filter */ + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + /* partition number for rebuild_sched_domains() */ int pn; /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset hierarchy */ - struct list_head stack_list; + struct work_struct hotplug_work; }; /* Retrieve the cpuset for a cgroup */ @@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +static inline struct cpuset *parent_cs(const struct cpuset *cs) +{ + struct cgroup *pcgrp = cs->css.cgroup->parent; + + if (pcgrp) + return cgroup_cs(pcgrp); + return NULL; +} + #ifdef CONFIG_NUMA static inline bool task_has_mempolicy(struct task_struct *task) { @@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) /* bits in struct cpuset flags field */ typedef enum { + CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -147,13 +152,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; -/* the type of hotplug event */ -enum hotplug_event { - CPUSET_CPU_OFFLINE, - CPUSET_MEM_OFFLINE, -}; - /* convenient tests for these bits */ +static inline bool is_cpuset_online(const struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags); +} + static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); @@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | + (1 << CS_MEM_EXCLUSIVE)), }; +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_cgrp: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ + cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ + if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_cgrp: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_cgrp by calling + * cgroup_rightmost_descendant() to skip subtree. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ + cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ + if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) + /* - * There are two global mutexes guarding cpuset structures. The first - * is the main control groups cgroup_mutex, accessed via - * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific - * callback_mutex, below. They can nest. It is ok to first take - * cgroup_mutex, then nest callback_mutex. We also require taking - * task_lock() when dereferencing a task's cpuset pointer. See "The - * task_lock() exception", at the end of this comment. - * - * A task must hold both mutexes to modify cpusets. If a task - * holds cgroup_mutex, then it blocks others wanting that mutex, - * ensuring that it is the only task able to also acquire callback_mutex - * and be able to modify cpusets. It can perform various checks on - * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding cgroup_mutex. While it is - * performing these checks, various callback routines can briefly - * acquire callback_mutex to query cpusets. Once it is ready to make - * the changes, it takes callback_mutex, blocking everyone else. + * There are two global mutexes guarding cpuset structures - cpuset_mutex + * and callback_mutex. The latter may nest inside the former. We also + * require taking task_lock() when dereferencing a task's cpuset pointer. + * See "The task_lock() exception", at the end of this comment. + * + * A task must hold both mutexes to modify cpusets. If a task holds + * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * is the only task able to also acquire callback_mutex and be able to + * modify cpusets. It can perform various checks on the cpuset structure + * first, knowing nothing will change. It can also allocate memory while + * just holding cpuset_mutex. While it is performing these checks, various + * callback routines can briefly acquire callback_mutex to query cpusets. + * Once it is ready to make the changes, it takes callback_mutex, blocking + * everyone else. * * Calls to the kernel memory allocator can not be made while holding * callback_mutex, as that would risk double tripping on callback_mutex @@ -232,18 +261,19 @@ static struct cpuset top_cpuset = { * guidelines for accessing subsystem state in kernel/cgroup.c */ +static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); /* - * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist - * buffers. They are statically allocated to prevent using excess stack - * when calling cpuset_print_task_mems_allowed(). + * CPU / memory hotplug is handled asynchronously. */ -#define CPUSET_NAME_LEN (128) -#define CPUSET_NODELIST_LEN (256) -static char cpuset_name[CPUSET_NAME_LEN]; -static char cpuset_nodelist[CPUSET_NODELIST_LEN]; -static DEFINE_SPINLOCK(cpuset_buffer_lock); +static struct workqueue_struct *cpuset_propagate_hotplug_wq; + +static void cpuset_hotplug_workfn(struct work_struct *work); +static void cpuset_propagate_hotplug_workfn(struct work_struct *work); +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); + +static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); /* * This is ugly, but preserves the userspace API for existing cpuset @@ -289,7 +319,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, struct cpumask *pmask) { while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) - cs = cs->parent; + cs = parent_cs(cs); if (cs) cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); else @@ -314,7 +344,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) { while (cs && !nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) - cs = cs->parent; + cs = parent_cs(cs); if (cs) nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); @@ -326,7 +356,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cgroup_mutex held + * Called with callback_mutex/cpuset_mutex held */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -346,7 +376,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cgroup_mutex. + * are only set if the other's are set. Call holding cpuset_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -395,7 +425,7 @@ static void free_trial_cpuset(struct cpuset *trial) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cgroup_mutex held. + * cpuset_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -412,48 +442,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) { struct cgroup *cont; struct cpuset *c, *par; + int ret; + + rcu_read_lock(); /* Each of our child cpusets must be a subset of us */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - if (!is_cpuset_subset(cgroup_cs(cont), trial)) - return -EBUSY; - } + ret = -EBUSY; + cpuset_for_each_child(c, cont, cur) + if (!is_cpuset_subset(c, trial)) + goto out; /* Remaining checks don't apply to root cpuset */ + ret = 0; if (cur == &top_cpuset) - return 0; + goto out; - par = cur->parent; + par = parent_cs(cur); /* We must be a subset of our parent cpuset */ + ret = -EACCES; if (!is_cpuset_subset(trial, par)) - return -EACCES; + goto out; /* * If either I or some sibling (!= me) is exclusive, we can't * overlap */ - list_for_each_entry(cont, &par->css.cgroup->children, sibling) { - c = cgroup_cs(cont); + ret = -EINVAL; + cpuset_for_each_child(c, cont, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - return -EINVAL; + goto out; if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && c != cur && nodes_intersects(trial->mems_allowed, c->mems_allowed)) - return -EINVAL; + goto out; } - /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ - if (cgroup_task_count(cur->css.cgroup)) { - if (cpumask_empty(trial->cpus_allowed) || - nodes_empty(trial->mems_allowed)) { - return -ENOSPC; - } - } + /* + * Cpusets with tasks - existing or newly being attached - can't + * have empty cpus_allowed or mems_allowed. + */ + ret = -ENOSPC; + if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && + (cpumask_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed))) + goto out; - return 0; + ret = 0; +out: + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SMP @@ -474,31 +514,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) return; } -static void -update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +static void update_domain_attr_tree(struct sched_domain_attr *dattr, + struct cpuset *root_cs) { - LIST_HEAD(q); - - list_add(&c->stack_list, &q); - while (!list_empty(&q)) { - struct cpuset *cp; - struct cgroup *cont; - struct cpuset *child; - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); + struct cpuset *cp; + struct cgroup *pos_cgrp; - if (cpumask_empty(cp->cpus_allowed)) + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + /* skip the whole subtree if @cp doesn't have any CPU */ + if (cpumask_empty(cp->cpus_allowed)) { + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); continue; + } if (is_sched_load_balance(cp)) update_domain_attr(dattr, cp); - - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } } + rcu_read_unlock(); } /* @@ -520,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cgroup_lock held. + * Must be called with cpuset_mutex held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -558,7 +591,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -567,6 +599,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ + struct cgroup *pos_cgrp; doms = NULL; dattr = NULL; @@ -594,33 +627,27 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; csn = 0; - list_add(&top_cpuset.stack_list, &q); - while (!list_empty(&q)) { - struct cgroup *cont; - struct cpuset *child; /* scans child cpusets of cp */ - - cp = list_first_entry(&q, struct cpuset, stack_list); - list_del(q.next); - - if (cpumask_empty(cp->cpus_allowed)) - continue; - + rcu_read_lock(); + cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { /* - * All child cpusets contain a subset of the parent's cpus, so - * just skip them, and then we call update_domain_attr_tree() - * to calc relax_domain_level of the corresponding sched - * domain. + * Continue traversing beyond @cp iff @cp has some CPUs and + * isn't load balancing. The former is obvious. The + * latter: All child cpusets contain a subset of the + * parent's cpus, so just skip them, and then we call + * update_domain_attr_tree() to calc relax_domain_level of + * the corresponding sched domain. */ - if (is_sched_load_balance(cp)) { - csa[csn++] = cp; + if (!cpumask_empty(cp->cpus_allowed) && + !is_sched_load_balance(cp)) continue; - } - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &q); - } - } + if (is_sched_load_balance(cp)) + csa[csn++] = cp; + + /* skip @cp's subtree */ + pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + } + rcu_read_unlock(); for (i = 0; i < csn; i++) csa[i]->pn = i; @@ -725,82 +752,50 @@ done: /* * Rebuild scheduler domains. * - * Call with neither cgroup_mutex held nor within get_online_cpus(). - * Takes both cgroup_mutex and get_online_cpus(). + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. * - * Cannot be directly called from cpuset code handling changes - * to the cpuset pseudo-filesystem, because it cannot be called - * from code that already holds cgroup_mutex. + * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void do_rebuild_sched_domains(struct work_struct *unused) +static void rebuild_sched_domains_locked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + lockdep_assert_held(&cpuset_mutex); get_online_cpus(); + /* + * We have raced with CPU hotplug. Don't do anything to avoid + * passing doms with offlined cpu to partition_sched_domains(). + * Anyways, hotplug work item will rebuild sched domains. + */ + if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + goto out; + /* Generate domain masks and attrs */ - cgroup_lock(); ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - +out: put_online_cpus(); } #else /* !CONFIG_SMP */ -static void do_rebuild_sched_domains(struct work_struct *unused) -{ -} - -static int generate_sched_domains(cpumask_var_t **domains, - struct sched_domain_attr **attributes) +static void rebuild_sched_domains_locked(void) { - *domains = NULL; - return 1; } #endif /* CONFIG_SMP */ -static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); - -/* - * Rebuild scheduler domains, asynchronously via workqueue. - * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. - * - * The rebuild_sched_domains() and partition_sched_domains() - * routines must nest cgroup_lock() inside get_online_cpus(), - * but such cpuset changes as these must nest that locking the - * other way, holding cgroup_lock() for much of the code. - * - * So in order to avoid an ABBA deadlock, the cpuset code handling - * these user changes delegates the actual sched domain rebuilding - * to a separate workqueue thread, which ends up processing the - * above do_rebuild_sched_domains() function. - */ -static void async_rebuild_sched_domains(void) -{ - queue_work(cpuset_wq, &rebuild_sched_domains_work); -} - -/* - * Accomplishes the same scheduler domain rebuild as the above - * async_rebuild_sched_domains(), however it directly calls the - * rebuild routine synchronously rather than calling it via an - * asynchronous work thread. - * - * This can only be called from code that is not holding - * cgroup_mutex (not nested in a cgroup_lock() call.) - */ void rebuild_sched_domains(void) { - do_rebuild_sched_domains(NULL); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_locked(); + mutex_unlock(&cpuset_mutex); } /** @@ -808,7 +803,7 @@ void rebuild_sched_domains(void) * @tsk: task to test * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Called for each task in a cgroup by cgroup_scan_tasks(). * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). @@ -829,7 +824,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, * cpus_allowed mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -842,7 +837,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -920,7 +915,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, heap_free(&heap); if (is_load_balanced) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); return 0; } @@ -932,7 +927,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cgroup_mutex, so current's cpuset won't change + * Call holding cpuset_mutex, so current's cpuset won't change * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing @@ -1007,7 +1002,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cgroup_mutex held. + * memory_migrate flag is set. Called with cpuset_mutex held. */ static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) @@ -1016,7 +1011,7 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - static nodemask_t newmems; /* protected by cgroup_mutex */ + static nodemask_t newmems; /* protected by cpuset_mutex */ cs = cgroup_cs(scan->cg); guarantee_online_mems(cs, &newmems); @@ -1043,7 +1038,7 @@ static void *cpuset_being_rebound; * @oldmem: old mems_allowed of cpuset cs * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * if @heap != NULL. */ @@ -1065,7 +1060,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cgroup_mutex, we know that no other rebind effort + * the global cpuset_mutex, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1084,7 +1079,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cgroup_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1168,7 +1163,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); } return 0; @@ -1182,7 +1177,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cgroup_lock() at this point. + * holding cpuset_mutex at this point. */ static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) @@ -1195,7 +1190,7 @@ static void cpuset_change_flag(struct task_struct *tsk, * @cs: the cpuset in which each task's spread flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * - * Called with cgroup_mutex held + * Called with cpuset_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. @@ -1220,7 +1215,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cgroup_mutex held. + * Call with cpuset_mutex held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1260,7 +1255,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - async_rebuild_sched_domains(); + rebuild_sched_domains_locked(); if (spread_flag_changed) update_tasks_flags(cs, &heap); @@ -1368,54 +1363,68 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* - * Protected by cgroup_lock. The nodemasks must be stored globally because - * dynamically allocating them is not allowed in can_attach, and they must - * persist until attach. - */ -static cpumask_var_t cpus_attach; -static nodemask_t cpuset_attach_nodemask_from; -static nodemask_t cpuset_attach_nodemask_to; - -/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; int ret; + mutex_lock(&cpuset_mutex); + + ret = -ENOSPC; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) - return -ENOSPC; + goto out_unlock; cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ - if (task->flags & PF_THREAD_BOUND) - return -EINVAL; - if ((ret = security_task_setscheduler(task))) - return ret; + ret = -EINVAL; + if (task->flags & PF_NO_SETAFFINITY) + goto out_unlock; + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; } - /* prepare for attach */ - if (cs == &top_cpuset) - cpumask_copy(cpus_attach, cpu_possible_mask); - else - guarantee_online_cpus(cs, cpus_attach); - - guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + /* + * Mark attach is in progress. This makes validate_change() fail + * changes which zero cpus/mems_allowed. + */ + cs->attach_in_progress++; + ret = 0; +out_unlock: + mutex_unlock(&cpuset_mutex); + return ret; +} - return 0; +static void cpuset_cancel_attach(struct cgroup *cgrp, + struct cgroup_taskset *tset) +{ + mutex_lock(&cpuset_mutex); + cgroup_cs(cgrp)->attach_in_progress--; + mutex_unlock(&cpuset_mutex); } +/* + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * but we can't allocate it dynamically there. Define it global and + * allocate from cpuset_init(). + */ +static cpumask_var_t cpus_attach; + static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { + /* static bufs protected by cpuset_mutex */ + static nodemask_t cpuset_attach_nodemask_from; + static nodemask_t cpuset_attach_nodemask_to; struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); @@ -1423,6 +1432,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp); + mutex_lock(&cpuset_mutex); + + /* prepare for attach */ + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); + cgroup_taskset_for_each(task, cgrp, tset) { /* * can_attach beforehand should guarantee that this doesn't @@ -1448,6 +1467,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) &cpuset_attach_nodemask_to); mmput(mm); } + + cs->attach_in_progress--; + + /* + * We may have raced with CPU/memory hotunplug. Trigger hotplug + * propagation if @cs doesn't have any CPU or memory. It will move + * the newly added tasks to the nearest parent which can execute. + */ + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + schedule_cpuset_propagate_hotplug(cs); + + mutex_unlock(&cpuset_mutex); } /* The various types of files and directories in a cpuset file system */ @@ -1469,12 +1500,13 @@ typedef enum { static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1508,18 +1540,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: @@ -1529,7 +1563,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) retval = -EINVAL; break; } - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1539,17 +1574,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, const char *buf) { - int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *trialcs; + int retval = -ENODEV; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; + /* + * CPU or memory hotunplug may leave @cs w/o any execution + * resources, in which case the hotplug code asynchronously updates + * configuration and transfers all tasks to the nearest ancestor + * which can execute. + * + * As writes to "cpus" or "mems" may restore @cs's execution + * resources, wait for the previously scheduled operations before + * proceeding, so that we don't end up keep removing tasks added + * after execution capability is restored. + * + * Flushing cpuset_hotplug_work is enough to synchronize against + * hotplug hanlding; however, cpuset_attach() may schedule + * propagation work directly. Flush the workqueue too. + */ + flush_work(&cpuset_hotplug_work); + flush_workqueue(cpuset_propagate_hotplug_wq); + + mutex_lock(&cpuset_mutex); + if (!is_cpuset_online(cs)) + goto out_unlock; trialcs = alloc_trial_cpuset(cs); if (!trialcs) { retval = -ENOMEM; - goto out; + goto out_unlock; } switch (cft->private) { @@ -1565,8 +1619,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); -out: - cgroup_unlock(); +out_unlock: + mutex_unlock(&cpuset_mutex); return retval; } @@ -1790,15 +1844,12 @@ static struct cftype files[] = { static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cgroup *parent_cg = cont->parent; - struct cgroup *tmp_cg; - struct cpuset *parent, *cs; + struct cpuset *cs; - if (!parent_cg) + if (!cont->parent) return &top_cpuset.css; - parent = cgroup_cs(parent_cg); - cs = kmalloc(sizeof(*cs), GFP_KERNEL); + cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { @@ -1806,22 +1857,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) return ERR_PTR(-ENOMEM); } - cs->flags = 0; - if (is_spread_page(parent)) - set_bit(CS_SPREAD_PAGE, &cs->flags); - if (is_spread_slab(parent)) - set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); + INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); cs->relax_domain_level = -1; - cs->parent = parent; + return &cs->css; +} + +static int cpuset_css_online(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *parent = parent_cs(cs); + struct cpuset *tmp_cs; + struct cgroup *pos_cg; + + if (!parent) + return 0; + + mutex_lock(&cpuset_mutex); + + set_bit(CS_ONLINE, &cs->flags); + if (is_spread_page(parent)) + set_bit(CS_SPREAD_PAGE, &cs->flags); + if (is_spread_slab(parent)) + set_bit(CS_SPREAD_SLAB, &cs->flags); + number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) - goto skip_clone; + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + goto out_unlock; /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is @@ -1836,35 +1903,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) * changed to grant parent->cpus_allowed-sibling_cpus_exclusive * (and likewise for mems) to the new cgroup. */ - list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { - struct cpuset *tmp_cs = cgroup_cs(tmp_cg); - - if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) - goto skip_clone; + rcu_read_lock(); + cpuset_for_each_child(tmp_cs, pos_cg, parent) { + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { + rcu_read_unlock(); + goto out_unlock; + } } + rcu_read_unlock(); mutex_lock(&callback_mutex); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); mutex_unlock(&callback_mutex); -skip_clone: - return &cs->css; +out_unlock: + mutex_unlock(&cpuset_mutex); + return 0; +} + +static void cpuset_css_offline(struct cgroup *cgrp) +{ + struct cpuset *cs = cgroup_cs(cgrp); + + mutex_lock(&cpuset_mutex); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + number_of_cpusets--; + clear_bit(CS_ONLINE, &cs->flags); + + mutex_unlock(&cpuset_mutex); } /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call async_rebuild_sched_domains(). + * will call rebuild_sched_domains_locked(). */ static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); - if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - number_of_cpusets--; free_cpumask_var(cs->cpus_allowed); kfree(cs); } @@ -1872,8 +1953,11 @@ static void cpuset_css_free(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .css_alloc = cpuset_css_alloc, + .css_online = cpuset_css_online, + .css_offline = cpuset_css_offline, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, + .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, .subsys_id = cpuset_subsys_id, .base_cftypes = files, @@ -1911,220 +1995,204 @@ int __init cpuset_init(void) return 0; } -/** - * cpuset_do_move_task - move a given task to another cpuset - * @tsk: pointer to task_struct the task to move - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * Return nonzero to stop the walk through the tasks. - */ -static void cpuset_do_move_task(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - struct cgroup *new_cgroup = scan->data; - - cgroup_attach_task(new_cgroup, tsk); -} - -/** - * move_member_tasks_to_cpuset - move tasks from one cpuset to another - * @from: cpuset in which the tasks currently reside - * @to: cpuset to which the tasks will be moved - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - */ -static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) -{ - struct cgroup_scanner scan; - - scan.cg = from->css.cgroup; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cpuset_do_move_task; - scan.heap = NULL; - scan.data = to->css.cgroup; - - if (cgroup_scan_tasks(&scan)) - printk(KERN_ERR "move_member_tasks_to_cpuset: " - "cgroup_scan_tasks failed\n"); -} - /* * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty * cpuset to its next-highest non-empty parent. - * - * Called with cgroup_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. */ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) { struct cpuset *parent; /* - * The cgroup's css_sets list is in use if there are tasks - * in the cpuset; the list is empty if there are none; - * the cs->css.refcnt seems always 0. - */ - if (list_empty(&cs->css.cgroup->css_sets)) - return; - - /* * Find its next-highest non-empty parent, (top cpuset * has online cpus, so can't be empty). */ - parent = cs->parent; + parent = parent_cs(cs); while (cpumask_empty(parent->cpus_allowed) || nodes_empty(parent->mems_allowed)) - parent = parent->parent; + parent = parent_cs(parent); - move_member_tasks_to_cpuset(cs, parent); + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + rcu_read_lock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", + cgroup_name(cs->css.cgroup)); + rcu_read_unlock(); + } } -/* - * Helper function to traverse cpusets. - * It can be used to walk the cpuset tree from top to bottom, completing - * one layer before dropping down to the next (thus always processing a - * node before any of its children). +/** + * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset + * @cs: cpuset in interest + * + * Compare @cs's cpu and mem masks against top_cpuset and if some have gone + * offline, update @cs accordingly. If @cs ends up with no CPU or memory, + * all its tasks are moved to the nearest ancestor with both resources. */ -static struct cpuset *cpuset_next(struct list_head *queue) +static void cpuset_propagate_hotplug_workfn(struct work_struct *work) { - struct cpuset *cp; - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + static cpumask_t off_cpus; + static nodemask_t off_mems, tmp_mems; + struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); + bool is_empty; - if (list_empty(queue)) - return NULL; + mutex_lock(&cpuset_mutex); + + cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); + nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); + + /* remove offline cpus from @cs */ + if (!cpumask_empty(&off_cpus)) { + mutex_lock(&callback_mutex); + cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); + mutex_unlock(&callback_mutex); + update_tasks_cpumask(cs, NULL); + } - cp = list_first_entry(queue, struct cpuset, stack_list); - list_del(queue->next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, queue); + /* remove offline mems from @cs */ + if (!nodes_empty(off_mems)) { + tmp_mems = cs->mems_allowed; + mutex_lock(&callback_mutex); + nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); + mutex_unlock(&callback_mutex); + update_tasks_nodemask(cs, &tmp_mems, NULL); } - return cp; + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + mutex_unlock(&cpuset_mutex); + + /* + * If @cs became empty, move tasks to the nearest ancestor with + * execution resources. This is full cgroup operation which will + * also call back into cpuset. Should be done outside any lock. + */ + if (is_empty) + remove_tasks_in_empty_cpuset(cs); + + /* the following may free @cs, should be the last operation */ + css_put(&cs->css); } +/** + * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset + * @cs: cpuset of interest + * + * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and + * memory masks according to top_cpuset. + */ +static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) +{ + /* + * Pin @cs. The refcnt will be released when the work item + * finishes executing. + */ + if (!css_tryget(&cs->css)) + return; -/* - * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory - * online/offline) and update the cpusets accordingly. - * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such - * cpuset must be moved to a parent cpuset. + /* + * Queue @cs->hotplug_work. If already pending, lose the css ref. + * cpuset_propagate_hotplug_wq is ordered and propagation will + * happen in the order this function is called. + */ + if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) + css_put(&cs->css); +} + +/** + * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * - * Called with cgroup_mutex held. We take callback_mutex to modify - * cpus_allowed and mems_allowed. + * This function is called after either CPU or memory configuration has + * changed and updates cpuset accordingly. The top_cpuset is always + * synchronized to cpu_active_mask and N_MEMORY, which is necessary in + * order to make cpusets transparent (of no affect) on systems that are + * actively using CPU hotplug but making no active use of cpusets. * - * This walk processes the tree from top to bottom, completing one layer - * before dropping down to the next. It always processes a node before - * any of its children. + * Non-root cpusets are only affected by offlining. If any CPUs or memory + * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all + * descendants. * - * In the case of memory hot-unplug, it will remove nodes from N_MEMORY - * if all present pages from a node are offlined. + * Note that CPU offlining during suspend is ignored. We don't modify + * cpusets across suspend/resume cycles at all. */ -static void -scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) +static void cpuset_hotplug_workfn(struct work_struct *work) { - LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - static nodemask_t oldmems; /* protected by cgroup_mutex */ + static cpumask_t new_cpus, tmp_cpus; + static nodemask_t new_mems, tmp_mems; + bool cpus_updated, mems_updated; + bool cpus_offlined, mems_offlined; - list_add_tail((struct list_head *)&root->stack_list, &queue); + mutex_lock(&cpuset_mutex); - switch (event) { - case CPUSET_CPU_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + /* fetch the available cpus/mems and find out which changed how */ + cpumask_copy(&new_cpus, cpu_active_mask); + new_mems = node_states[N_MEMORY]; - /* Continue past cpusets with all cpus online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) - continue; + cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); + cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, + &new_cpus); - /* Remove offline cpus from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - mutex_unlock(&callback_mutex); + mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); + nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); + mems_offlined = !nodes_empty(tmp_mems); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_cpumask(cp, NULL); - } - break; + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { + mutex_lock(&callback_mutex); + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + mutex_unlock(&callback_mutex); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } - case CPUSET_MEM_OFFLINE: - while ((cp = cpuset_next(&queue)) != NULL) { + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { + tmp_mems = top_cpuset.mems_allowed; + mutex_lock(&callback_mutex); + top_cpuset.mems_allowed = new_mems; + mutex_unlock(&callback_mutex); + update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); + } - /* Continue past cpusets with all mems online */ - if (nodes_subset(cp->mems_allowed, - node_states[N_MEMORY])) - continue; + /* if cpus or mems went down, we need to propagate to descendants */ + if (cpus_offlined || mems_offlined) { + struct cpuset *cs; + struct cgroup *pos_cgrp; - oldmems = cp->mems_allowed; + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) + schedule_cpuset_propagate_hotplug(cs); + rcu_read_unlock(); + } - /* Remove offline mems from this cpuset. */ - mutex_lock(&callback_mutex); - nodes_and(cp->mems_allowed, cp->mems_allowed, - node_states[N_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&cpuset_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else - update_tasks_nodemask(cp, &oldmems, NULL); - } - } + /* wait for propagations to finish */ + flush_workqueue(cpuset_propagate_hotplug_wq); + + /* rebuild sched domains if cpus_allowed has changed */ + if (cpus_updated) + rebuild_sched_domains(); } -/* - * The top_cpuset tracks what CPUs and Memory Nodes are online, - * period. This is necessary in order to make cpusets transparent - * (of no affect) on systems that are actively using CPU hotplug - * but making no active use of cpusets. - * - * The only exception to this is suspend/resume, where we don't - * modify cpusets at all. - * - * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_active_mask on each CPU hotplug (cpuhp) event. - * - * Called within get_online_cpus(). Needs to call cgroup_lock() - * before calling generate_sched_domains(). - * - * @cpu_online: Indicates whether this is a CPU online event (true) or - * a CPU offline event (false). - */ void cpuset_update_active_cpus(bool cpu_online) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - cgroup_lock(); - mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); - mutex_unlock(&callback_mutex); - - if (!cpu_online) - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); - - ndoms = generate_sched_domains(&doms, &attr); - cgroup_unlock(); - - /* Have scheduler rebuild the domains */ - partition_sched_domains(ndoms, doms, attr); + /* + * We're inside cpu hotplug critical region which usually nests + * inside cgroup synchronization. Bounce actual hotplug processing + * to a work item to avoid reverse locking order. + * + * We still need to do partition_sched_domains() synchronously; + * otherwise, the scheduler will get confused and put tasks to the + * dead CPU. Fall back to the default single domain. + * cpuset_hotplug_workfn() will rebuild it as necessary. + */ + partition_sched_domains(1, NULL, NULL); + schedule_work(&cpuset_hotplug_work); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2133,48 +2201,30 @@ void cpuset_update_active_cpus(bool cpu_online) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - static nodemask_t oldmems; /* protected by cgroup_mutex */ - - cgroup_lock(); - switch (action) { - case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; - mutex_lock(&callback_mutex); - top_cpuset.mems_allowed = node_states[N_MEMORY]; - mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); - break; - case MEM_OFFLINE: - /* - * needn't update top_cpuset.mems_allowed explicitly because - * scan_cpusets_upon_hotplug() will update it. - */ - scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); - break; - default: - break; - } - cgroup_unlock(); - + schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { + .notifier_call = cpuset_track_online_nodes, + .priority = 10, /* ??! */ +}; /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); - cpuset_wq = create_singlethread_workqueue("cpuset"); - BUG_ON(!cpuset_wq); + cpuset_propagate_hotplug_wq = + alloc_ordered_workqueue("cpuset_hotplug", 0); + BUG_ON(!cpuset_propagate_hotplug_wq); } /** @@ -2273,8 +2323,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) */ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) - cs = cs->parent; + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) + cs = parent_cs(cs); return cs; } @@ -2412,17 +2462,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) } /** - * cpuset_unlock - release lock on cpuset changes - * - * Undo the lock taken in a previous cpuset_lock() call. - */ - -void cpuset_unlock(void) -{ - mutex_unlock(&callback_mutex); -} - -/** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page * @@ -2497,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +#define CPUSET_NODELIST_LEN (256) + /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed * @task: pointer to task_struct of some task. @@ -2507,17 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { - struct dentry *dentry; + /* Statically allocated to prevent using excess stack. */ + static char cpuset_nodelist[CPUSET_NODELIST_LEN]; + static DEFINE_SPINLOCK(cpuset_buffer_lock); + + struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - dentry = task_cs(tsk)->css.cgroup->dentry; + rcu_read_lock(); spin_lock(&cpuset_buffer_lock); - snprintf(cpuset_name, CPUSET_NAME_LEN, - dentry ? (const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cpuset_name, cpuset_nodelist); + tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); + rcu_read_unlock(); } /* @@ -2560,10 +2606,10 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ -static int proc_cpuset_show(struct seq_file *m, void *unused_v) +int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; @@ -2582,35 +2628,21 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; - retval = -EINVAL; - cgroup_lock(); + rcu_read_lock(); css = task_subsys_state(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + rcu_read_unlock(); if (retval < 0) - goto out_unlock; + goto out_put_task; seq_puts(m, buf); seq_putc(m, '\n'); -out_unlock: - cgroup_unlock(); +out_put_task: put_task_struct(tsk); out_free: kfree(buf); out: return retval; } - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_PROC_PID_CPUSET */ /* Display task mems_allowed in /proc/<pid>/status file. */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 9a61738cefc8..0506d447aed2 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -29,6 +29,7 @@ */ #include <linux/pid_namespace.h> #include <linux/clocksource.h> +#include <linux/serial_core.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/console.h> @@ -774,7 +775,7 @@ static void sysrq_handle_dbg(int key) static struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, - .help_msg = "debug(G)", + .help_msg = "debug(g)", .action_msg = "DEBUG", }; #endif diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 3494c28a7e7a..2235967e78b0 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -72,6 +72,8 @@ extern int dbg_kdb_mode; #ifdef CONFIG_KGDB_KDB extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); +extern int kdb_common_init_state(struct kgdb_state *ks); +extern int kdb_common_deinit_state(void); #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index ce615e064482..19d9a578c753 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/kgdb.h> #include <linux/kdb.h> +#include <linux/serial_core.h> #include <linux/reboot.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks) len = len / 2; remcom_out_buffer[len++] = 0; + kdb_common_init_state(ks); kdb_parse(remcom_out_buffer); + kdb_common_deinit_state(); + strcpy(remcom_out_buffer, "OK"); } break; diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 8418c2f8ec5d..70a504601dc3 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv) /* * kdb_ss * - * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) - * commands. + * Process the 'ss' (Single Step) command. * * ss - * ssb * * Parameters: * argc Argument count @@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv) * Outputs: * None. * Returns: - * KDB_CMD_SS[B] for success, a kdb error if failure. + * KDB_CMD_SS for success, a kdb error if failure. * Locking: * None. * Remarks: * * Set the arch specific option to trigger a debug trap after the next * instruction. - * - * For 'ssb', set the trace flag in the debug trap handler - * after printing the current insn and return directly without - * invoking the kdb command processor, until a branch instruction - * is encountered. */ static int kdb_ss(int argc, const char **argv) { - int ssb = 0; - - ssb = (strcmp(argv[0], "ssb") == 0); if (argc != 0) return KDB_ARGCOUNT; /* * Set trace flag and go. */ KDB_STATE_SET(DOING_SS); - if (ssb) { - KDB_STATE_SET(DOING_SSB); - return KDB_CMD_SSB; - } return KDB_CMD_SS; } @@ -561,8 +547,6 @@ void __init kdb_initbptab(void) kdb_register_repeat("ss", kdb_ss, "", "Single Step", 1, KDB_REPEAT_NO_ARGS); - kdb_register_repeat("ssb", kdb_ss, "", - "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); /* * Architecture dependent initialization. */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index be7b33b73d30..328d18ef31e4 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx); static struct kgdb_state *kdb_ks; +int kdb_common_init_state(struct kgdb_state *ks) +{ + kdb_initial_cpu = atomic_read(&kgdb_active); + kdb_current_task = kgdb_info[ks->cpu].task; + kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + return 0; +} + +int kdb_common_deinit_state(void) +{ + kdb_initial_cpu = -1; + kdb_current_task = NULL; + kdb_current_regs = NULL; + return 0; +} + int kdb_stub(struct kgdb_state *ks) { int error = 0; @@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks) } /* Set initial kdb state variables */ KDB_STATE_CLEAR(KGDB_TRANS); - kdb_initial_cpu = atomic_read(&kgdb_active); - kdb_current_task = kgdb_info[ks->cpu].task; - kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + kdb_common_init_state(ks); /* Remove any breakpoints as needed by kdb and clear single step */ kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); - KDB_STATE_CLEAR(DOING_SSB); KDB_STATE_SET(PAGER); /* zero out any offline cpu data */ for_each_present_cpu(i) { @@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks) * Upon exit from the kdb main loop setup break points and restart * the system based on the requested continue state */ - kdb_initial_cpu = -1; - kdb_current_task = NULL; - kdb_current_regs = NULL; + kdb_common_deinit_state(); KDB_STATE_CLEAR(PAGER); kdbnearsym_cleanup(); if (error == KDB_CMD_KGDB) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 8875254120b6..00eb8f7fbf41 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = { }; #undef KDBMSG -static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); +static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* @@ -175,7 +175,7 @@ static char *__env[] = { (char *)0, }; -static const int __nenv = (sizeof(__env) / sizeof(char *)); +static const int __nenv = ARRAY_SIZE(__env); struct task_struct *kdb_curr_task(int cpu) { @@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv) } if (argc != 3) return KDB_ARGCOUNT; - defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), - GFP_KDB); - if (!defcmd_set) { - kdb_printf("Could not allocate new defcmd_set entry for %s\n", - argv[1]); - defcmd_set = save_defcmd_set; + if (in_dbg_master()) { + kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } + defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), + GFP_KDB); + if (!defcmd_set) + goto fail_defcmd; memcpy(defcmd_set, save_defcmd_set, defcmd_set_count * sizeof(*defcmd_set)); - kfree(save_defcmd_set); s = defcmd_set + defcmd_set_count; memset(s, 0, sizeof(*s)); s->usable = 1; s->name = kdb_strdup(argv[1], GFP_KDB); + if (!s->name) + goto fail_name; s->usage = kdb_strdup(argv[2], GFP_KDB); + if (!s->usage) + goto fail_usage; s->help = kdb_strdup(argv[3], GFP_KDB); + if (!s->help) + goto fail_help; if (s->usage[0] == '"') { - strcpy(s->usage, s->usage+1); + strcpy(s->usage, argv[2]+1); s->usage[strlen(s->usage)-1] = '\0'; } if (s->help[0] == '"') { - strcpy(s->help, s->help+1); + strcpy(s->help, argv[3]+1); s->help[strlen(s->help)-1] = '\0'; } ++defcmd_set_count; defcmd_in_progress = 1; + kfree(save_defcmd_set); return 0; +fail_help: + kfree(s->usage); +fail_usage: + kfree(s->name); +fail_name: + kfree(defcmd_set); +fail_defcmd: + kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; } /* @@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p) * KDB_CMD_GO User typed 'go'. * KDB_CMD_CPU User switched to another cpu. * KDB_CMD_SS Single step. - * KDB_CMD_SSB Single step until branch. */ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_dbtrap_t db_result) @@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", instruction_pointer(regs)); break; - case KDB_DB_SSB: - /* - * In the midst of ssb command. Just return. - */ - KDB_DEBUG_STATE("kdb_local 3", reason); - return KDB_CMD_SSB; /* Continue with SSB command */ - - break; case KDB_DB_SS: break; case KDB_DB_SSBPT: @@ -1281,7 +1288,6 @@ do_full_getstr: if (diag == KDB_CMD_GO || diag == KDB_CMD_CPU || diag == KDB_CMD_SS - || diag == KDB_CMD_SSB || diag == KDB_CMD_KGDB) break; @@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, break; } - if (result == KDB_CMD_SSB) { - KDB_STATE_SET(DOING_SS); - KDB_STATE_SET(DOING_SSB); - break; - } - if (result == KDB_CMD_KGDB) { if (!KDB_STATE(DOING_KGDB)) kdb_printf("Entering please attach debugger " @@ -2350,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv) return 0; } -/* - * kdb_ll - This function implements the 'll' command which follows a - * linked list and executes an arbitrary command for each - * element. - */ -static int kdb_ll(int argc, const char **argv) -{ - int diag = 0; - unsigned long addr; - long offset = 0; - unsigned long va; - unsigned long linkoffset; - int nextarg; - const char *command; - - if (argc != 3) - return KDB_ARGCOUNT; - - nextarg = 1; - diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); - if (diag) - return diag; - - diag = kdbgetularg(argv[2], &linkoffset); - if (diag) - return diag; - - /* - * Using the starting address as - * the first element in the list, and assuming that - * the list ends with a null pointer. - */ - - va = addr; - command = kdb_strdup(argv[3], GFP_KDB); - if (!command) { - kdb_printf("%s: cannot duplicate command\n", __func__); - return 0; - } - /* Recursive use of kdb_parse, do not use argv after this point */ - argv = NULL; - - while (va) { - char buf[80]; - - if (KDB_FLAG(CMD_INTERRUPT)) - goto out; - - sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); - diag = kdb_parse(buf); - if (diag) - goto out; - - addr = va + linkoffset; - if (kdb_getword(&va, addr, sizeof(va))) - goto out; - } - -out: - kfree(command); - return diag; -} - static int kdb_kgdb(int argc, const char **argv) { return KDB_CMD_KGDB; @@ -2430,11 +2367,15 @@ static int kdb_help(int argc, const char **argv) kdb_printf("-----------------------------" "-----------------------------\n"); for_each_kdbcmd(kt, i) { - if (kt->cmd_name) - kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, - kt->cmd_usage, kt->cmd_help); + char *space = ""; if (KDB_FLAG(CMD_INTERRUPT)) return 0; + if (!kt->cmd_name) + continue; + if (strlen(kt->cmd_usage) > 20) + space = "\n "; + kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name, + kt->cmd_usage, space, kt->cmd_help); } return 0; } @@ -2739,7 +2680,7 @@ int kdb_register_repeat(char *cmd, (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); kfree(kdb_commands); } - memset(new + kdb_max_commands, 0, + memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0, kdb_command_extend * sizeof(*new)); kdb_commands = new; kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; @@ -2843,15 +2784,13 @@ static void __init kdb_inittab(void) "Stack traceback", 1, KDB_REPEAT_NONE); kdb_register_repeat("btp", kdb_bt, "<pid>", "Display stack for process <pid>", 0, KDB_REPEAT_NONE); - kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", - "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]", + "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE); kdb_register_repeat("btc", kdb_bt, "", "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); kdb_register_repeat("btt", kdb_bt, "<vaddr>", "Backtrace process given its struct task address", 0, KDB_REPEAT_NONE); - kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>", - "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); kdb_register_repeat("env", kdb_env, "", "Show environment variables", 0, KDB_REPEAT_NONE); kdb_register_repeat("set", kdb_set, "", diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 392ec6a25844..7afd3c8c41d5 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -19,7 +19,6 @@ #define KDB_CMD_GO (-1001) #define KDB_CMD_CPU (-1002) #define KDB_CMD_SS (-1003) -#define KDB_CMD_SSB (-1004) #define KDB_CMD_KGDB (-1005) /* Internal debug flags */ @@ -125,8 +124,6 @@ extern int kdb_state; * kdb control */ #define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ #define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ -#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, - * DOING_SS is also set */ #define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint * after one ss, independent of * DOING_SS */ @@ -191,7 +188,6 @@ extern void kdb_bp_remove(void); typedef enum { KDB_DB_BPT, /* Breakpoint */ KDB_DB_SS, /* Single-step trap */ - KDB_DB_SSB, /* Single step to branch */ KDB_DB_SSBPT, /* Single step over breakpoint */ KDB_DB_NOBPT /* Spurious breakpoint */ } kdb_dbtrap_t; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 418b3f7053aa..d473988c1d0b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) unsigned long long t2, t3; unsigned long flags; struct timespec ts; + cputime_t utime, stime, stimescaled, utimescaled; /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data @@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) goto done; tmp = (s64)d->cpu_run_real_total; - cputime_to_timespec(tsk->utime + tsk->stime, &ts); + task_cputime(tsk, &utime, &stime); + cputime_to_timespec(utime + stime, &ts); tmp += timespec_to_ns(&ts); d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; tmp = (s64)d->cpu_scaled_run_real_total; - cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); + task_cputime_scaled(tsk, &utimescaled, &stimescaled); + cputime_to_timespec(utimescaled + stimescaled, &ts); tmp += timespec_to_ns(&ts); d->cpu_scaled_run_real_total = (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b6646a8c067..6b41c1899a8b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -18,6 +18,7 @@ #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> +#include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -37,6 +38,7 @@ #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> +#include <linux/cgroup.h> #include "internal.h" @@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF /* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. @@ -251,7 +267,22 @@ perf_cgroup_match(struct perf_event *event) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - return !event->cgrp || event->cgrp == cpuctx->cgrp; + /* @event doesn't care about cgroup */ + if (!event->cgrp) + return true; + + /* wants specific cgroup scope but @cpuctx isn't associated with any */ + if (!cpuctx->cgrp) + return false; + + /* + * Cgroup scoping is recursive. An event enabled for a cgroup is + * also enabled for all its descendant cgroups. If @cpuctx's + * cgroup is a descendant of @event's (the test covers identity + * case), it's a match. + */ + return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, + event->cgrp->css.cgroup); } static inline bool perf_tryget_cgroup(struct perf_event *event) @@ -655,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu) WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) + if (list_empty(&cpuctx->rotation_list)) { + int was_empty = list_empty(head); list_add(&cpuctx->rotation_list, head); + if (was_empty) + tick_nohz_full_kick(); + } } static void get_ctx(struct perf_event_context *ctx) @@ -961,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -2555,6 +2596,16 @@ done: list_del_init(&cpuctx->rotation_list); } +#ifdef CONFIG_NO_HZ_FULL +bool perf_event_can_stop_tick(void) +{ + if (list_empty(&__get_cpu_var(rotation_list))) + return true; + else + return false; +} +#endif + void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); @@ -3691,7 +3742,7 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(filp); struct perf_event *event = filp->private_data; int retval; @@ -4178,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, @@ -4434,12 +4491,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (ctx) - perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } + if (task_event->task_ctx) + perf_event_task_ctx(task_event->task_ctx, task_event); + rcu_read_unlock(); } @@ -4593,6 +4653,7 @@ void perf_event_comm(struct task_struct *task) struct perf_event_context *ctx; int ctxn; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (!ctx) @@ -4600,6 +4661,7 @@ void perf_event_comm(struct task_struct *task) perf_event_enable_on_exec(ctx); } + rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; @@ -4734,7 +4796,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); + sizeof(tmp) - 1); + tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } @@ -4761,6 +4824,9 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + if (!(vma->vm_flags & VM_EXEC)) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); @@ -5126,7 +5192,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; - struct hlist_node *node; struct hlist_head *head; rcu_read_lock(); @@ -5134,7 +5199,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (!head) goto end; - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_event(event, nr, data, regs); } @@ -5328,7 +5393,7 @@ static void sw_perf_event_destroy(struct perf_event *event) static int perf_swevent_init(struct perf_event *event) { - int event_id = event->attr.config; + u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; @@ -5419,7 +5484,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, { struct perf_sample_data data; struct perf_event *event; - struct hlist_node *node; struct perf_raw_record raw = { .size = entry_size, @@ -5429,7 +5493,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr, 0); data.raw = &raw; - hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } @@ -5649,6 +5713,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } @@ -5965,13 +6030,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type) pmu->name = name; if (type < 0) { - int err = idr_pre_get(&pmu_idr, GFP_KERNEL); - if (!err) - goto free_pdc; - - err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); - if (err) { - ret = err; + type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL); + if (type < 0) { + ret = type; goto free_pdc; } } @@ -5988,6 +6049,7 @@ skip_type: if (pmu->pmu_cpu_context) goto got_cpu_context; + ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; @@ -6171,11 +6233,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; + + if (attr->type == PERF_TYPE_TRACEPOINT) + event->hw.tp_target = task; #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * hw_breakpoint is a bit difficult here.. */ - if (attr->type == PERF_TYPE_BREAKPOINT) + else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif } @@ -7512,12 +7577,5 @@ struct cgroup_subsys perf_subsys = { .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, - - /* - * perf_event cgroup doesn't handle nesting correctly. - * ctx->nr_cgroups adjustments should be propagated through the - * cgroup hierarchy. Fix it and remove the following. - */ - .broken_hierarchy = true, }; #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fe8a916507ed..a64f8aeb5c1f 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); if (err_cpu == cpu) break; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8b..eb675c4d59df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer { int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ + int overwrite; /* can overwrite itself */ atomic_t poll; /* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff3973..cd55144270b5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@ static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, unsigned long offset, unsigned long head) { - unsigned long mask; + unsigned long sz = perf_data_size(rb); + unsigned long mask = sz - 1; - if (!rb->writable) + /* + * check if user-writable + * overwrite : over-write its own tail + * !overwrite: buffer possibly drops events. + */ + if (rb->overwrite) return true; - mask = perf_data_size(rb) - 1; + /* + * verify that payload is not bigger than buffer + * otherwise masking logic may fail to detect + * the "not enough space" condition + */ + if ((head - offset) > sz) + return false; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) - rb->writable = 1; + rb->overwrite = 0; + else + rb->overwrite = 1; atomic_set(&rb->refcount, 1); @@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb) } #else +static int data_page_nr(struct ring_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { - if (pgoff > (1UL << page_order(rb))) + /* The '>' counts in the user page. */ + if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); @@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work) int i, nr; rb = container_of(work, struct ring_buffer, work); - nr = 1 << page_order(rb); + nr = data_page_nr(rb); base = rb->user_page; - for (i = 0; i < nr + 1; i++) + /* The '<=' counts in the user page. */ + for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); @@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; rb->page_order = ilog2(nr_pages); - rb->nr_pages = 1; + rb->nr_pages = !!nr_pages; ring_buffer_init(rb, watermark, flags); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index dea7acfbb071..f3569747d629 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> +#include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ #include <linux/swap.h> /* try_to_free_swap */ @@ -41,58 +42,31 @@ #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; - -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ - -#define UPROBES_HASH_SZ 13 - /* - * We need separate register/unregister and mmap/munmap lock hashes because - * of mmap_sem nesting. - * - * uprobe_register() needs to install probes on (potentially) all processes - * and thus needs to acquire multiple mmap_sems (consequtively, not - * concurrently), whereas uprobe_mmap() is called while holding mmap_sem - * for the particular process doing the mmap. - * - * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem - * because of lock order against i_mmap_mutex. This means there's a hole in - * the register vma iteration where a mmap() can happen. - * - * Thus uprobe_register() can race with uprobe_mmap() and we can try and - * install a probe where one is already installed. + * allows us to skip the uprobe_mmap if there are no uprobe events active + * at this time. Probably a fine grained per inode count is better? */ +#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -/* serialize (un)register */ -static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; - -#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) +static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +#define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) static struct percpu_rw_semaphore dup_mmap_sem; -/* - * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe - * events active at this time. Probably a fine grained per inode count is - * better? - */ -static atomic_t uprobe_events = ATOMIC_INIT(0); - /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBE_RUN_HANDLER 1 /* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 2 +#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; + struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; - struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */ struct list_head pending_list; struct uprobe_consumer *consumers; struct inode *inode; /* Also hold a ref to inode */ @@ -101,6 +75,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -199,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } -static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). + */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); - memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); + kunmap_atomic(kaddr); +} + +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +{ + void *kaddr = kmap_atomic(page); + memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } @@ -211,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; - copy_opcode(page, vaddr, &old_opcode); + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ + copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -230,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ @@ -251,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; int ret; @@ -272,15 +284,8 @@ retry: __SetPageUptodate(new_page); - /* copy the page now that we've got it stable */ - vaddr_old = kmap_atomic(old_page); - vaddr_new = kmap_atomic(new_page); - - memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); - - kunmap_atomic(vaddr_new); - kunmap_atomic(vaddr_old); + copy_highpage(new_page, old_page); + copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = anon_vma_prepare(vma); if (ret) @@ -430,9 +435,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) u = __insert_uprobe(uprobe); spin_unlock(&uprobes_treelock); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - return u; } @@ -452,8 +454,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; + init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - mutex_init(&uprobe->copy_mutex); + /* For now assume that the instruction need not be single-stepped */ + __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -463,38 +467,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) kfree(uprobe); uprobe = cur_uprobe; iput(inode); - } else { - atomic_inc(&uprobe_events); } return uprobe; } -static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) -{ - struct uprobe_consumer *uc; - - if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags)) - return; - - down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - if (!uc->filter || uc->filter(uc, current)) - uc->handler(uc, regs); - } - up_read(&uprobe->consumer_rwsem); -} - -/* Returns the previous consumer */ -static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); uc->next = uprobe->consumers; uprobe->consumers = uc; up_write(&uprobe->consumer_rwsem); - - return uc->next; } /* @@ -525,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, loff_t offset) { struct page *page; - void *vaddr; - unsigned long off; - pgoff_t idx; - - if (!filp) - return -EINVAL; if (!mapping->a_ops->readpage) return -EIO; - - idx = offset >> PAGE_CACHE_SHIFT; - off = offset & ~PAGE_MASK; - /* * Ensure that the page that has the original instruction is * populated and in page-cache. */ - page = read_mapping_page(mapping, idx, filp); + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); if (IS_ERR(page)) return PTR_ERR(page); - vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off, nbytes); - kunmap_atomic(vaddr); + copy_from_page(page, offset, insn, nbytes); page_cache_release(page); return 0; @@ -588,7 +559,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; - mutex_lock(&uprobe->copy_mutex); + /* TODO: move this into _register, until then we abuse this sem. */ + down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; @@ -597,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -612,7 +584,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: - mutex_unlock(&uprobe->copy_mutex); + up_write(&uprobe->consumer_rwsem); + + return ret; +} + +static inline bool consumer_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + return !uc->filter || uc->filter(uc, ctx, mm); +} + +static bool filter_chain(struct uprobe *uprobe, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct uprobe_consumer *uc; + bool ret = false; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + ret = consumer_filter(uc, ctx, mm); + if (ret) + break; + } + up_read(&uprobe->consumer_rwsem); return ret; } @@ -624,16 +619,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, bool first_uprobe; int ret; - /* - * If probe is being deleted, unregister thread could be done with - * the vma-rmap-walk through. Adding a probe now can be fatal since - * nobody will be able to cleanup. Also we could be from fork or - * mremap path, where the probe might have already been inserted. - * Hence behave as if probe already existed. - */ - if (!uprobe->consumers) - return 0; - ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; @@ -658,14 +643,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - /* can happen if uprobe_register() fails */ - if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) - return 0; - set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} /* * There could be threads that have already hit the breakpoint. They * will recheck the current insn and restart if find_uprobe() fails. @@ -673,12 +658,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { + if (WARN_ON(!uprobe_is_active(uprobe))) + return; + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock(&uprobes_treelock); + RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ iput(uprobe->inode); put_uprobe(uprobe); - atomic_dec(&uprobe_events); } struct map_info { @@ -764,8 +752,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) return curr; } -static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +static int +register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { + bool is_register = !!new; struct map_info *info; int err = 0; @@ -787,17 +777,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) down_write(&mm->mmap_sem); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); - else - err |= remove_breakpoint(uprobe, mm, info->vaddr); + if (is_register) { + /* consult only the "caller", new consumer. */ + if (consumer_filter(new, + UPROBE_FILTER_REGISTER, mm)) + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + if (!filter_chain(uprobe, + UPROBE_FILTER_UNREGISTER, mm)) + err |= remove_breakpoint(uprobe, mm, info->vaddr); + } unlock: up_write(&mm->mmap_sem); @@ -810,17 +806,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) return err; } -static int __uprobe_register(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc) { - return register_for_each_vma(uprobe, true); + consumer_add(uprobe, uc); + return register_for_each_vma(uprobe, uc); } -static void __uprobe_unregister(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { - if (!register_for_each_vma(uprobe, false)) - delete_uprobe(uprobe); + int err; + if (!consumer_del(uprobe, uc)) /* WARN? */ + return; + + err = register_for_each_vma(uprobe, NULL); /* TODO : cant unregister? schedule a worker thread */ + if (!uprobe->consumers && !err) + delete_uprobe(uprobe); } /* @@ -845,31 +847,63 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; - if (!inode || !uc || uc->next) + /* Uprobe must have at least one set consumer */ + if (!uc->handler && !uc->ret_handler) return -EINVAL; + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; - ret = 0; - mutex_lock(uprobes_hash(inode)); + retry: uprobe = alloc_uprobe(inode, offset); - - if (!uprobe) { - ret = -ENOMEM; - } else if (!consumer_add(uprobe, uc)) { - ret = __uprobe_register(uprobe); - if (ret) { - uprobe->consumers = NULL; - __uprobe_unregister(uprobe); - } else { - set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } + if (!uprobe) + return -ENOMEM; + /* + * We can race with uprobe_unregister()->delete_uprobe(). + * Check uprobe_is_active() and retry if it is false. + */ + down_write(&uprobe->register_rwsem); + ret = -EAGAIN; + if (likely(uprobe_is_active(uprobe))) { + ret = __uprobe_register(uprobe, uc); + if (ret) + __uprobe_unregister(uprobe, uc); } + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + if (unlikely(ret == -EAGAIN)) + goto retry; + return ret; +} +EXPORT_SYMBOL_GPL(uprobe_register); + +/* + * uprobe_apply - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. + * @uc: consumer which wants to add more or remove some breakpoints + * @add: add or remove the breakpoints + */ +int uprobe_apply(struct inode *inode, loff_t offset, + struct uprobe_consumer *uc, bool add) +{ + struct uprobe *uprobe; + struct uprobe_consumer *con; + int ret = -ENOENT; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return ret; + + down_write(&uprobe->register_rwsem); + for (con = uprobe->consumers; con && con != uc ; con = con->next) + ; + if (con) + ret = register_for_each_vma(uprobe, add ? uc : NULL); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); return ret; } @@ -884,25 +918,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume { struct uprobe *uprobe; - if (!inode || !uc) - return; - uprobe = find_uprobe(inode, offset); if (!uprobe) return; - mutex_lock(uprobes_hash(inode)); + down_write(&uprobe->register_rwsem); + __uprobe_unregister(uprobe, uc); + up_write(&uprobe->register_rwsem); + put_uprobe(uprobe); +} +EXPORT_SYMBOL_GPL(uprobe_unregister); - if (consumer_del(uprobe, uc)) { - if (!uprobe->consumers) { - __uprobe_unregister(uprobe); - clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); - } +static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int err = 0; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long vaddr; + loff_t offset; + + if (!valid_vma(vma, false) || + file_inode(vma->vm_file) != uprobe->inode) + continue; + + offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + if (uprobe->offset < offset || + uprobe->offset >= offset + vma->vm_end - vma->vm_start) + continue; + + vaddr = offset_to_vaddr(vma, uprobe->offset); + err |= remove_breakpoint(uprobe, mm, vaddr); } + up_read(&mm->mmap_sem); - mutex_unlock(uprobes_hash(inode)); - if (uprobe) - put_uprobe(uprobe); + return err; } static struct rb_node * @@ -979,18 +1030,23 @@ int uprobe_mmap(struct vm_area_struct *vma) struct uprobe *uprobe, *u; struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + if (no_uprobe_events() || !valid_vma(vma, true)) return 0; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - + /* + * We can race with uprobe_unregister(), this uprobe can be already + * removed. But in this case filter_chain() must return false, all + * consumers have gone away. + */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } @@ -1008,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e struct inode *inode; struct rb_node *n; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; @@ -1025,7 +1081,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) + if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ @@ -1042,22 +1098,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct xol_area *area) { - struct mm_struct *mm; - int ret; - - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) - return -ENOMEM; - - ret = -EALREADY; - mm = current->mm; + struct mm_struct *mm = current->mm; + int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (area->vaddr & ~PAGE_MASK) { @@ -1073,54 +1121,59 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; ret = 0; - -fail: + fail: up_write(&mm->mmap_sem); - if (ret) - __free_page(area->page); return ret; } -static struct xol_area *get_xol_area(struct mm_struct *mm) -{ - struct xol_area *area; - - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ - - return area; -} - /* - * xol_alloc_area - Allocate process's xol_area. - * This area will be used for storing instructions for execution out of - * line. + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ -static struct xol_area *xol_alloc_area(void) +static struct xol_area *get_xol_area(void) { + struct mm_struct *mm = current->mm; struct xol_area *area; + uprobe_opcode_t insn = UPROBE_SWBP_INSN; + + area = mm->uprobes_state.xol_area; + if (area) + goto ret; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) - return NULL; + goto out; area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); - if (!area->bitmap) - goto fail; + goto free_area; + + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) + goto free_bitmap; + /* allocate first slot of task's xol_area for the return probes */ + set_bit(0, area->bitmap); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + atomic_set(&area->slot_count, 1); init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) return area; -fail: + __free_page(area->page); + free_bitmap: kfree(area->bitmap); + free_area: kfree(area); - - return get_xol_area(current->mm); + out: + area = mm->uprobes_state.xol_area; + ret: + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + return area; } /* @@ -1186,43 +1239,31 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) } /* - * xol_get_insn_slot - If was not allocated a slot, then - * allocate a slot. + * xol_get_insn_slot - allocate a slot for xol. * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) +static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; - unsigned long offset; - void *vaddr; + unsigned long xol_vaddr; - area = get_xol_area(current->mm); - if (!area) { - area = xol_alloc_area(); - if (!area) - return 0; - } - current->utask->xol_vaddr = xol_take_insn_slot(area); + area = get_xol_area(); + if (!area) + return 0; - /* - * Initialize the slot if xol_vaddr points to valid - * instruction slot. - */ - if (unlikely(!current->utask->xol_vaddr)) + xol_vaddr = xol_take_insn_slot(area); + if (unlikely(!xol_vaddr)) return 0; - current->utask->vaddr = slot_addr; - offset = current->utask->xol_vaddr & ~PAGE_MASK; - vaddr = kmap_atomic(area->page); - memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); - kunmap_atomic(vaddr); + /* Initialize the slot */ + copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. */ flush_dcache_page(area->page); - return current->utask->xol_vaddr; + return xol_vaddr; } /* @@ -1240,8 +1281,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) return; slot_addr = tsk->utask->xol_vaddr; - - if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr))) + if (unlikely(!slot_addr)) return; area = tsk->mm->uprobes_state.xol_area; @@ -1282,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; + struct return_instance *ri, *tmp; if (!utask) return; @@ -1289,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t) if (utask->active_uprobe) put_uprobe(utask->active_uprobe); + ri = utask->return_instances; + while (ri) { + tmp = ri; + ri = ri->next; + + put_uprobe(tmp->uprobe); + kfree(tmp); + } + xol_free_insn_slot(t); kfree(utask); t->utask = NULL; @@ -1303,33 +1353,135 @@ void uprobe_copy_process(struct task_struct *t) } /* - * Allocate a uprobe_task object for the task. - * Called when the thread hits a breakpoint for the first time. + * Allocate a uprobe_task object for the task if if necessary. + * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ -static struct uprobe_task *add_utask(void) +static struct uprobe_task *get_utask(void) { + if (!current->utask) + current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + return current->utask; +} + +/* + * Current area->vaddr notion assume the trampoline address is always + * equal area->vaddr. + * + * Returns -1 in case the xol_area is not allocated. + */ +static unsigned long get_trampoline_vaddr(void) +{ + struct xol_area *area; + unsigned long trampoline_vaddr = -1; + + area = current->mm->uprobes_state.xol_area; + smp_read_barrier_depends(); + if (area) + trampoline_vaddr = area->vaddr; + + return trampoline_vaddr; +} + +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct return_instance *ri; struct uprobe_task *utask; + unsigned long orig_ret_vaddr, trampoline_vaddr; + bool chained = false; - utask = kzalloc(sizeof *utask, GFP_KERNEL); - if (unlikely(!utask)) - return NULL; + if (!get_xol_area()) + return; + + utask = get_utask(); + if (!utask) + return; - current->utask = utask; - return utask; + if (utask->depth >= MAX_URETPROBE_DEPTH) { + printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" + " nestedness limit pid/tgid=%d/%d\n", + current->pid, current->tgid); + return; + } + + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!ri) + goto fail; + + trampoline_vaddr = get_trampoline_vaddr(); + orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); + if (orig_ret_vaddr == -1) + goto fail; + + /* + * We don't want to keep trampoline address in stack, rather keep the + * original return address of first caller thru all the consequent + * instances. This also makes breakpoint unwrapping easier. + */ + if (orig_ret_vaddr == trampoline_vaddr) { + if (!utask->return_instances) { + /* + * This situation is not possible. Likely we have an + * attack from user-space. + */ + pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + goto fail; + } + + chained = true; + orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; + } + + atomic_inc(&uprobe->ref); + ri->uprobe = uprobe; + ri->func = instruction_pointer(regs); + ri->orig_ret_vaddr = orig_ret_vaddr; + ri->chained = chained; + + utask->depth++; + + /* add instance to the stack */ + ri->next = utask->return_instances; + utask->return_instances = ri; + + return; + + fail: + kfree(ri); } /* Prepare to single-step probed instruction out of line. */ static int -pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) - return 0; + struct uprobe_task *utask; + unsigned long xol_vaddr; + int err; - return -EFAULT; + utask = get_utask(); + if (!utask) + return -ENOMEM; + + xol_vaddr = xol_get_insn_slot(uprobe); + if (!xol_vaddr) + return -ENOMEM; + + utask->xol_vaddr = xol_vaddr; + utask->vaddr = bp_vaddr; + + err = arch_uprobe_pre_xol(&uprobe->arch, regs); + if (unlikely(err)) { + xol_free_insn_slot(current); + return err; + } + + utask->active_uprobe = uprobe; + utask->state = UTASK_SSTEP; + return 0; } /* @@ -1391,6 +1543,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. + * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; @@ -1399,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1417,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_opcode(page, vaddr, &opcode); + copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1433,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe(inode, offset); } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } @@ -1452,20 +1606,116 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + int remove = UPROBE_HANDLER_REMOVE; + bool need_prep = false; /* prepare return uprobe, when needed */ + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + int rc = 0; + + if (uc->handler) { + rc = uc->handler(uc, regs); + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + } + + if (uc->ret_handler) + need_prep = true; + + remove &= rc; + } + + if (need_prep && !remove) + prepare_uretprobe(uprobe, regs); /* put bp at return */ + + if (remove && uprobe->consumers) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + up_read(&uprobe->register_rwsem); +} + +static void +handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +{ + struct uprobe *uprobe = ri->uprobe; + struct uprobe_consumer *uc; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (uc->ret_handler) + uc->ret_handler(uc, ri->func, regs); + } + up_read(&uprobe->register_rwsem); +} + +static bool handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *tmp; + bool chained; + + utask = current->utask; + if (!utask) + return false; + + ri = utask->return_instances; + if (!ri) + return false; + + /* + * TODO: we should throw out return_instance's invalidated by + * longjmp(), currently we assume that the probed function always + * returns. + */ + instruction_pointer_set(regs, ri->orig_ret_vaddr); + + for (;;) { + handle_uretprobe_chain(ri, regs); + + chained = ri->chained; + put_uprobe(ri->uprobe); + + tmp = ri; + ri = ri->next; + kfree(tmp); + + if (!chained) + break; + + utask->depth--; + + BUG_ON(!ri); + } + + utask->return_instances = ri; + + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { - struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + if (bp_vaddr == get_trampoline_vaddr()) { + if (handle_trampoline(regs)) + return; + + pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + } + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -1483,6 +1733,10 @@ static void handle_swbp(struct pt_regs *regs) } return; } + + /* change it in advance for ->handler() and restart */ + instruction_pointer_set(regs, bp_vaddr); + /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the @@ -1490,32 +1744,16 @@ static void handle_swbp(struct pt_regs *regs) */ smp_rmb(); /* pairs with wmb() in install_breakpoint() */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) - goto restart; - - utask = current->utask; - if (!utask) { - utask = add_utask(); - /* Cannot allocate; re-execute the instruction. */ - if (!utask) - goto restart; - } + goto out; handler_chain(uprobe, regs); if (can_skip_sstep(uprobe, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) { - utask->active_uprobe = uprobe; - utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - } -restart: - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); + /* can_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } @@ -1576,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) + if (!current->mm) + return 0; + + if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); @@ -1609,10 +1851,8 @@ static int __init init_uprobes(void) { int i; - for (i = 0; i < UPROBES_HASH_SZ; i++) { - mutex_init(&uprobes_mutex[i]); + for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - } if (percpu_init_rwsem(&dup_mmap_sem)) return -ENOMEM; diff --git a/kernel/exit.c b/kernel/exit.c index b4df21937216..af2eb3cbd499 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -20,6 +20,7 @@ #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> @@ -31,7 +32,6 @@ #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> -#include <linux/freezer.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk) bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *uninitialized_var(tty); + cputime_t utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); @@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime += tsk->utime; - sig->stime += tsk->stime; - sig->gtime += tsk->gtime; + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -483,7 +485,7 @@ static void exit_mm(struct task_struct * tsk) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!self.task) /* see coredump_finish() */ break; - schedule(); + freezable_schedule(); } __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); @@ -845,7 +847,7 @@ void do_exit(long code) exit_io_context(tsk); if (tsk->splice_pipe) - __free_pipe_info(tsk->splice_pipe); + free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); @@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) sig = p->signal; psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; - psig->cgtime += p->gtime + sig->gtime + sig->cgtime; + psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += @@ -1627,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, } put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1667,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, ret = do_wait(&wo); put_pid(pid); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/extable.c b/kernel/extable.c index fe35a634bf76..67460b93b1a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) + if (main_extable_sort_needed) { + pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); - else - pr_notice("__ex_table already sorted, skipping sort\n"); + } } /* Given an address, look for it in the exception tables. */ diff --git a/kernel/fork.c b/kernel/fork.c index c535f33bbb9c..987b28a1f01b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -70,6 +70,7 @@ #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> +#include <linux/aio.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -413,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); @@ -1141,6 +1142,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -1230,9 +1234,15 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqlock_init(&p->vtime_seqlock); + p->vtime_snap = 0; + p->vtime_snap_whence = VTIME_SLEEPING; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif @@ -1294,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif +#ifdef CONFIG_BCACHE + p->sequential_io = 0; + p->sequential_io_avg = 0; +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p); @@ -1668,10 +1682,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); - asmlinkage_protect(5, ret, clone_flags, newsp, - parent_tidptr, child_tidptr, tls_val); - return ret; + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif @@ -1801,7 +1812,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * If unsharing a user namespace must also unshare the thread. */ if (unshare_flags & CLONE_NEWUSER) - unshare_flags |= CLONE_THREAD; + unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing a pid namespace must also unshare the thread. */ @@ -1855,10 +1866,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) exit_sem(current); } - if (new_nsproxy) { + if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); - new_nsproxy = NULL; - } task_lock(current); @@ -1888,9 +1897,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } } - if (new_nsproxy) - put_nsproxy(new_nsproxy); - bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); diff --git a/kernel/futex.c b/kernel/futex.c index 19eb089ca003..b26dcfc02c94 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,7 @@ #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> +#include <linux/sched/rt.h> #include <asm/futex.h> @@ -222,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key) * @rw: mapping needs to be read/write (values: VERIFY_READ, * VERIFY_WRITE) * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + * * The key words are stored in *key on success. * - * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, + * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). * We can usually work out the index without swapping in the page. * @@ -704,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * be "current" except in the case of requeue pi. * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * - * Returns: - * 0 - ready to wait - * 1 - acquired the lock + * Return: + * 0 - ready to wait; + * 1 - acquired the lock; * <0 - error * * The hb->lock and futex_key refs shall be held by the caller. @@ -1190,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * - * Returns: - * 0 - failed to acquire the lock atomicly - * 1 - acquired the lock + * Return: + * 0 - failed to acquire the lock atomically; + * 1 - acquired the lock; * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1253,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken; * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1535,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must * be paired with exactly one earlier call to queue_me(). * - * Returns: - * 1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + * 1 - if the futex_q was still queued (and we removed unqueued it); * 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) @@ -1706,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart); * the pi_state owner as well as handle race conditions that may allow us to * acquire the lock. Must be called with the hb lock held. * - * Returns: - * 1 - success, lock taken - * 0 - success, lock not taken + * Return: + * 1 - success, lock taken; + * 0 - success, lock not taken; * <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1823,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * Return with the hb lock held and a q.key reference on success, and unlocked * with no q.key reference on failure. * - * Returns: - * 0 - uaddr contains val and hb has been locked + * Return: + * 0 - uaddr contains val and hb has been locked; * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2202,9 +2204,9 @@ pi_faulted: * the wakeup and return the appropriate error code to the caller. Must be * called with the hb lock held. * - * Returns - * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + * 0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2246,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2257,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() * 2) wakeup on uaddr2 after a requeue * 3) signal @@ -2275,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * - * Returns: - * 0 - On success + * Return: + * 0 - On success; * <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, @@ -2471,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83e368b005fc..f9f44fd4d34d 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -11,6 +11,7 @@ #include <linux/nsproxy.h> #include <linux/futex.h> #include <linux/ptrace.h> +#include <linux/syscalls.h> #include <asm/uaccess.h> @@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr) } } -asmlinkage long -compat_sys_set_robust_list(struct compat_robust_list_head __user *head, - compat_size_t len) +COMPAT_SYSCALL_DEFINE2(set_robust_list, + struct compat_robust_list_head __user *, head, + compat_size_t, len) { if (!futex_cmpxchg_enabled) return -ENOSYS; @@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head, return 0; } -asmlinkage long -compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, - compat_size_t __user *len_ptr) +COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + compat_uptr_t __user *, head_ptr, + compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; @@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, if (!futex_cmpxchg_enabled) return -ENOSYS; - WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); - rcu_read_lock(); ret = -ESRCH; @@ -172,9 +171,9 @@ err_unlock: return ret; } -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - u32 val3) +COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct compat_timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) { struct timespec ts; ktime_t t, *tp = NULL; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..d4da55d1fb65 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6db7a5ed52b5..fd4b13b131f8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,8 @@ #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> #include <linux/timer.h> #include <asm/uaccess.h> @@ -61,6 +63,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { @@ -81,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -88,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -104,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); @@ -113,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -158,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, */ static int hrtimer_get_target(int this_cpu, int pinned) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif @@ -273,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } else { unsigned long rem = do_div(nsec, NSEC_PER_SEC); + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + tmp = ktime_set((long)nsec, rem); } @@ -640,29 +658,18 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) * and expiry check is done in the hrtimer_interrupt or in the softirq. */ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { - if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - if (wakeup) { - raw_spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - raw_spin_lock(&base->cpu_base->lock); - } else - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - - return 1; - } - - return 0; + return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* @@ -735,8 +742,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base, - int wakeup) + struct hrtimer_clock_base *base) { return 0; } @@ -995,8 +1001,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * * XXX send_remote_softirq() ? */ - if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base, wakeup); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) + && hrtimer_enqueue_reprogram(timer, new_base)) { + if (wakeup) { + /* + * We need to drop cpu_base->lock to avoid a + * lock ordering issue vs. rq->lock. + */ + raw_spin_unlock(&new_base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + local_irq_restore(flags); + return ret; + } else { + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + } unlock_hrtimer_base(timer, &flags); @@ -1008,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1025,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1104,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * @@ -1307,6 +1328,8 @@ retry: expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; if (expires.tv64 < expires_next.tv64) expires_next = expires; break; @@ -1640,8 +1663,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3aca9f29d30e..cbd97ce0b000 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data) EXPORT_SYMBOL(irq_set_handler_data); /** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + * @irq_base: Interrupt number base + * @irq_offset: Interrupt number offset + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq + * Set the MSI descriptor entry for an irq at offset */ -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, + struct msi_desc *entry) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->irq_data.msi_desc = entry; - if (entry) - entry->irq = irq; + if (entry && !irq_offset) + entry->irq = irq_base; irq_put_desc_unlock(desc, flags); return 0; } /** + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data + * + * Set the MSI descriptor entry for an irq + */ +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +{ + return irq_set_msi_desc_off(irq, 0, entry); +} + +/** * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 96f3a1d9c379..5a83dde8ca0c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, if (domain->ops->map) { ret = domain->ops->map(domain, virq, hwirq); if (ret != 0) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + /* + * If map() returns -EPERM, this interrupt is protected + * by the firmware or some other service and shall not + * be mapped. + * + * Since on some platforms we blindly try to map everything + * we end up with a log full of backtraces. + * + * So instead, we silently fail on -EPERM, it is the + * responsibility of the PIC driver to display a relevant + * message if needed. + */ + if (ret != -EPERM) { + pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", + virq, hwirq, ret); + WARN_ON(1); + } irq_data->domain = NULL; irq_data->hwirq = 0; goto err_unmap; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e49a288fa479..fa17855ca65a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/task_work.h> #include "internals.h" @@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type) out: irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(enable_percpu_irq); void disable_percpu_irq(unsigned int irq) { @@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq) irq_percpu_disable(desc, cpu); irq_put_desc_unlock(desc, flags); } +EXPORT_SYMBOL_GPL(disable_percpu_irq); /* * Internal function to unregister a percpu irqaction. diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4bd4faa6323a..19ed5c425c3b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; + unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); cpumask_var_t new_value; int err; @@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_affinity_proc_fops = { @@ -212,7 +212,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE(inode)->data); + return single_open(file, default_affinity_show, PDE_DATA(inode)); } static const struct file_operations default_affinity_proc_fops = { @@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) static int irq_node_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_node_proc_show, PDE(inode)->data); + return single_open(file, irq_node_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_node_proc_fops = { @@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) static int irq_spurious_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_spurious_proc_show, PDE(inode)->data); + return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_spurious_proc_fops = { @@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) { - struct irq_desc *desc = irq_to_desc(irq); - - remove_proc_entry(action->dir->name, desc->dir); - } + proc_remove(action->dir); } static void register_default_affinity_proc(void) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 611cd6003c45..7b5f012bde9d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) /* * All handlers must agree on IRQF_SHARED, so we test just the - * first. Check for action->next as well. + * first. */ action = desc->action; if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER) || - (action->handler(irq, action->dev_id) == IRQ_HANDLED) || - !action->next) + (action->flags & __IRQF_TIMER)) goto out; /* Already running on another processor */ @@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) do { if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; + /* Make sure that there is still a valid action */ action = desc->action; } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 1588e3b2871b..55fcce6065cf 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -12,37 +12,36 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/irqflags.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/processor.h> -/* - * An entry can be in one of four states: - * - * free NULL, 0 -> {claimed} : free to be used - * claimed NULL, 3 -> {pending} : claimed to be enqueued - * pending next, 3 -> {busy} : queued, pending callback - * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed - */ - -#define IRQ_WORK_PENDING 1UL -#define IRQ_WORK_BUSY 2UL -#define IRQ_WORK_FLAGS 3UL static DEFINE_PER_CPU(struct llist_head, irq_work_list); +static DEFINE_PER_CPU(int, irq_work_raised); /* * Claim the entry so that no one else will poke at it. */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, nflags; + unsigned long flags, oflags, nflags; + /* + * Start with our best wish as a premise but only trust any + * flag value after cmpxchg() result. + */ + flags = work->flags & ~IRQ_WORK_PENDING; for (;;) { - flags = work->flags; - if (flags & IRQ_WORK_PENDING) - return false; nflags = flags | IRQ_WORK_FLAGS; - if (cmpxchg(&work->flags, flags, nflags) == flags) + oflags = cmpxchg(&work->flags, flags, nflags); + if (oflags == flags) break; + if (oflags & IRQ_WORK_PENDING) + return false; + flags = oflags; cpu_relax(); } @@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void) } /* - * Queue the entry and raise the IPI if needed. + * Enqueue the irq_work @entry unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. */ -static void __irq_work_queue(struct irq_work *work) +void irq_work_queue(struct irq_work *work) { - bool empty; + /* Only queue if not already pending */ + if (!irq_work_claim(work)) + return; + /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - /* The list was empty, raise self-interrupt to start processing. */ - if (empty) - arch_irq_work_raise(); + llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); + + /* + * If the work is not "lazy" or the tick is stopped, raise the irq + * work interrupt (if supported by the arch), otherwise, just wait + * for the next tick. + */ + if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { + if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) + arch_irq_work_raise(); + } preempt_enable(); } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* - * Enqueue the irq_work @entry, returns true on success, failure when the - * @entry was already enqueued by someone else. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue(struct irq_work *work) +bool irq_work_needs_cpu(void) { - if (!irq_work_claim(work)) { - /* - * Already enqueued, can't do! - */ + struct llist_head *this_list; + + this_list = &__get_cpu_var(irq_work_list); + if (llist_empty(this_list)) return false; - } - __irq_work_queue(work); + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + return true; } -EXPORT_SYMBOL_GPL(irq_work_queue); -/* - * Run the irq_work entries on this cpu. Requires to be ran from hardirq - * context with local IRQs disabled. - */ -void irq_work_run(void) +static void __irq_work_run(void) { + unsigned long flags; struct irq_work *work; struct llist_head *this_list; struct llist_node *llnode; + + /* + * Reset the "raised" state right before we check the list because + * an NMI may enqueue after we find the list empty from the runner. + */ + __this_cpu_write(irq_work_raised, 0); + barrier(); + this_list = &__get_cpu_var(irq_work_list); if (llist_empty(this_list)) return; - BUG_ON(!in_irq()); BUG_ON(!irqs_disabled()); llnode = llist_del_all(this_list); @@ -119,16 +130,31 @@ void irq_work_run(void) /* * Clear the PENDING bit, after this point the @work * can be re-used. + * Make it immediately visible so that other CPUs trying + * to claim that work don't rely on us to handle their data + * while we are in the middle of the func. */ - work->flags = IRQ_WORK_BUSY; + flags = work->flags & ~IRQ_WORK_PENDING; + xchg(&work->flags, flags); + work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. */ - (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); + (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + BUG_ON(!in_irq()); + __irq_work_run(); +} EXPORT_SYMBOL_GPL(irq_work_run); /* @@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +#ifdef CONFIG_HOTPLUG_CPU +static int irq_work_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_DYING: + /* Called from stop_machine */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + break; + __irq_work_run(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block cpu_notify; + +static __init int irq_work_init_cpu_notifier(void) +{ + cpu_notify.notifier_call = irq_work_cpu_notify; + cpu_notify.priority = 0; + register_cpu_notifier(&cpu_notify); + return 0; +} +device_initcall(irq_work_init_cpu_notifier); + +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2169feeba529..3127ad52cdb2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr) /* * Expand a compressed symbol data into the resulting uncompressed string, + * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ -static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) +static unsigned int kallsyms_expand_symbol(unsigned int off, + char *result, size_t maxlen) { int len, skipped_first = 0; const u8 *tptr, *data; @@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) while (*tptr) { if (skipped_first) { + if (maxlen <= 1) + goto tail; *result = *tptr; result++; + maxlen--; } else skipped_first = 1; tptr++; } } - *result = '\0'; +tail: + if (maxlen) + *result = '\0'; /* Return to offset to the next symbol. */ return off; @@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned int off; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) return kallsyms_addresses[i]; @@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); if (ret != 0) return ret; @@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr, pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); + kallsyms_expand_symbol(get_symbol_offset(pos), + namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; return namebuf; @@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), symname); + kallsyms_expand_symbol(get_symbol_offset(pos), + symname, KSYM_NAME_LEN); return 0; } /* See if it's in a module. */ @@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, pos = get_symbol_pos(addr, size, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), name); + kallsyms_expand_symbol(get_symbol_offset(pos), + name, KSYM_NAME_LEN); modname[0] = '\0'; return 0; } @@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) iter->type = kallsyms_get_symbol_type(off); - off = kallsyms_expand_symbol(off, iter->name); + off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 5e4bd7864c5d..59f7b55ba745 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -54,6 +54,12 @@ struct resource crashk_res = { .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource crashk_low_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; int kexec_should_crash(struct task_struct *p) { @@ -223,6 +229,8 @@ out: } +static void kimage_free_page_list(struct list_head *list); + static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments) @@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, if (result) goto out; - *rimage = image; - /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be @@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; + goto out_free; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { printk(KERN_ERR "Could not allocate swap buffer\n"); - goto out; + goto out_free; } - result = 0; - out: - if (result == 0) - *rimage = image; - else - kfree(image); + *rimage = image; + return 0; +out_free: + kimage_free_page_list(&image->control_pages); + kfree(image); +out: return result; } @@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out; + goto out_free; } /* @@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); - goto out; + goto out_free; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + *rimage = image; + return 0; +out_free: + kfree(image); +out: return result; } @@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; - if (hole_end > crashk_res.end) - break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; @@ -783,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -816,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image, /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); result = copy_from_user(ptr, buf, uchunk); kunmap(page); @@ -847,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image, * We do things a page at a time for the sake of kmap. */ unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -868,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ptr = kmap(page); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) { - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } @@ -1115,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, { unsigned long addr; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) @@ -1365,34 +1357,114 @@ static int __init parse_crashkernel_simple(char *cmdline, return 0; } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. */ -int __init parse_crashkernel(char *cmdline, +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, - unsigned long long *crash_base) + unsigned long long *crash_base, + const char *name, + const char *suffix) { - char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; + char *ck_cmdline; BUG_ON(!crash_size || !crash_base); *crash_size = 0; *crash_base = 0; - /* find crashkernel and use the last one if there are more */ - p = strstr(p, "crashkernel="); - while (p) { - ck_cmdline = p; - p = strstr(p+1, "crashkernel="); - } + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); if (!ck_cmdline) return -EINVAL; - ck_cmdline += 12; /* strlen("crashkernel=") */ + ck_cmdline += strlen(name); + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + crash_base, suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -1409,6 +1481,36 @@ int __init parse_crashkernel(char *cmdline, return 0; } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} static void update_vmcoreinfo_note(void) { @@ -1431,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; char buf[0x50]; - int r; + size_t r; va_start(args, fmt); r = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - if (r + vmcoreinfo_size > vmcoreinfo_max_size) - r = vmcoreinfo_max_size - vmcoreinfo_size; + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); @@ -1468,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); + VMCOREINFO_SYMBOL(vmap_area_list); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1490,6 +1591,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, _count); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -1504,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); @@ -1512,6 +1616,11 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); diff --git a/kernel/kfifo.c b/kernel/kfifo.c deleted file mode 100644 index 59dcf5b81d24..000000000000 --- a/kernel/kfifo.c +++ /dev/null @@ -1,609 +0,0 @@ -/* - * A generic kernel FIFO implementation - * - * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include <linux/kernel.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/log2.h> -#include <linux/uaccess.h> -#include <linux/kfifo.h> - -/* - * internal helper to calculate the unused elements in a fifo - */ -static inline unsigned int kfifo_unused(struct __kfifo *fifo) -{ - return (fifo->mask + 1) - (fifo->in - fifo->out); -} - -int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask) -{ - /* - * round down to the next power of 2, since our 'let the indices - * wrap' technique works only in this case. - */ - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - - if (size < 2) { - fifo->data = NULL; - fifo->mask = 0; - return -EINVAL; - } - - fifo->data = kmalloc(size * esize, gfp_mask); - - if (!fifo->data) { - fifo->mask = 0; - return -ENOMEM; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_alloc); - -void __kfifo_free(struct __kfifo *fifo) -{ - kfree(fifo->data); - fifo->in = 0; - fifo->out = 0; - fifo->esize = 0; - fifo->data = NULL; - fifo->mask = 0; -} -EXPORT_SYMBOL(__kfifo_free); - -int __kfifo_init(struct __kfifo *fifo, void *buffer, - unsigned int size, size_t esize) -{ - size /= esize; - - if (!is_power_of_2(size)) - size = rounddown_pow_of_two(size); - - fifo->in = 0; - fifo->out = 0; - fifo->esize = esize; - fifo->data = buffer; - - if (size < 2) { - fifo->mask = 0; - return -EINVAL; - } - fifo->mask = size - 1; - - return 0; -} -EXPORT_SYMBOL(__kfifo_init); - -static void kfifo_copy_in(struct __kfifo *fifo, const void *src, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(fifo->data + off, src, l); - memcpy(fifo->data, src + l, len - l); - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_in(struct __kfifo *fifo, - const void *buf, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - kfifo_copy_in(fifo, buf, len, fifo->in); - fifo->in += len; - return len; -} -EXPORT_SYMBOL(__kfifo_in); - -static void kfifo_copy_out(struct __kfifo *fifo, void *dst, - unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - memcpy(dst, fifo->data + off, l); - memcpy(dst + l, fifo->data, len - l); - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); -} - -unsigned int __kfifo_out_peek(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - kfifo_copy_out(fifo, buf, len, fifo->out); - return len; -} -EXPORT_SYMBOL(__kfifo_out_peek); - -unsigned int __kfifo_out(struct __kfifo *fifo, - void *buf, unsigned int len) -{ - len = __kfifo_out_peek(fifo, buf, len); - fifo->out += len; - return len; -} -EXPORT_SYMBOL(__kfifo_out); - -static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned int *copied) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned long ret; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_from_user(fifo->data + off, from, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_from_user(fifo->data, from + l, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data in the fifo is up to date before - * incrementing the fifo->in index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->in += len; - return err; -} -EXPORT_SYMBOL(__kfifo_from_user); - -static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, - unsigned int len, unsigned int off, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - ret = copy_to_user(to, fifo->data + off, l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret + len - l, esize); - else { - ret = copy_to_user(to + l, fifo->data, len - l); - if (unlikely(ret)) - ret = DIV_ROUND_UP(ret, esize); - } - /* - * make sure that the data is copied before - * incrementing the fifo->out index counter - */ - smp_wmb(); - *copied = len - ret; - /* return the number of elements which are not copied */ - return ret; -} - -int __kfifo_to_user(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied) -{ - unsigned int l; - unsigned long ret; - unsigned int esize = fifo->esize; - int err; - - if (esize != 1) - len /= esize; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); - if (unlikely(ret)) { - len -= ret; - err = -EFAULT; - } else - err = 0; - fifo->out += len; - return err; -} -EXPORT_SYMBOL(__kfifo_to_user); - -static int setup_sgl_buf(struct scatterlist *sgl, void *buf, - int nents, unsigned int len) -{ - int n; - unsigned int l; - unsigned int off; - struct page *page; - - if (!nents) - return 0; - - if (!len) - return 0; - - n = 0; - page = virt_to_page(buf); - off = offset_in_page(buf); - l = 0; - - while (len >= l + PAGE_SIZE - off) { - struct page *npage; - - l += PAGE_SIZE; - buf += PAGE_SIZE; - npage = virt_to_page(buf); - if (page_to_phys(page) != page_to_phys(npage) - l) { - sg_set_page(sgl, page, l - off, off); - sgl = sg_next(sgl); - if (++n == nents || sgl == NULL) - return n; - page = npage; - len -= l - off; - l = off = 0; - } - } - sg_set_page(sgl, page, len, off); - return n + 1; -} - -static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, - int nents, unsigned int len, unsigned int off) -{ - unsigned int size = fifo->mask + 1; - unsigned int esize = fifo->esize; - unsigned int l; - unsigned int n; - - off &= fifo->mask; - if (esize != 1) { - off *= esize; - size *= esize; - len *= esize; - } - l = min(len, size - off); - - n = setup_sgl_buf(sgl, fifo->data + off, nents, l); - n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); - - return n; -} - -unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = kfifo_unused(fifo); - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->in); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare); - -unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len) -{ - unsigned int l; - - l = fifo->in - fifo->out; - if (len > l) - len = l; - - return setup_sgl(fifo, sgl, nents, len, fifo->out); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare); - -unsigned int __kfifo_max_r(unsigned int len, size_t recsize) -{ - unsigned int max = (1 << (recsize << 3)) - 1; - - if (len > max) - return max; - return len; -} -EXPORT_SYMBOL(__kfifo_max_r); - -#define __KFIFO_PEEK(data, out, mask) \ - ((data)[(out) & (mask)]) -/* - * __kfifo_peek_n internal helper function for determinate the length of - * the next record in the fifo - */ -static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) -{ - unsigned int l; - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - l = __KFIFO_PEEK(data, fifo->out, mask); - - if (--recsize) - l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; - - return l; -} - -#define __KFIFO_POKE(data, in, mask, val) \ - ( \ - (data)[(in) & (mask)] = (unsigned char)(val) \ - ) - -/* - * __kfifo_poke_n internal helper function for storeing the length of - * the record into the fifo - */ -static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) -{ - unsigned int mask = fifo->mask; - unsigned char *data = fifo->data; - - __KFIFO_POKE(data, fifo->in, mask, n); - - if (recsize > 1) - __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); -} - -unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) -{ - return __kfifo_peek_n(fifo, recsize); -} -EXPORT_SYMBOL(__kfifo_len_r); - -unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, - unsigned int len, size_t recsize) -{ - if (len + recsize > kfifo_unused(fifo)) - return 0; - - __kfifo_poke_n(fifo, len, recsize); - - kfifo_copy_in(fifo, buf, len, fifo->in + recsize); - fifo->in += len + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_in_r); - -static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, - void *buf, unsigned int len, size_t recsize, unsigned int *n) -{ - *n = __kfifo_peek_n(fifo, recsize); - - if (len > *n) - len = *n; - - kfifo_copy_out(fifo, buf, len, fifo->out + recsize); - return len; -} - -unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - return kfifo_out_copy_r(fifo, buf, len, recsize, &n); -} -EXPORT_SYMBOL(__kfifo_out_peek_r); - -unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, - unsigned int len, size_t recsize) -{ - unsigned int n; - - if (fifo->in == fifo->out) - return 0; - - len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); - fifo->out += n + recsize; - return len; -} -EXPORT_SYMBOL(__kfifo_out_r); - -void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int n; - - n = __kfifo_peek_n(fifo, recsize); - fifo->out += n + recsize; -} -EXPORT_SYMBOL(__kfifo_skip_r); - -int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) { - *copied = 0; - return 0; - } - - __kfifo_poke_n(fifo, len, recsize); - - ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->in += len + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_from_user_r); - -int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, - unsigned long len, unsigned int *copied, size_t recsize) -{ - unsigned long ret; - unsigned int n; - - if (fifo->in == fifo->out) { - *copied = 0; - return 0; - } - - n = __kfifo_peek_n(fifo, recsize); - if (len > n) - len = n; - - ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); - if (unlikely(ret)) { - *copied = 0; - return -EFAULT; - } - fifo->out += n + recsize; - return 0; -} -EXPORT_SYMBOL(__kfifo_to_user_r); - -unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > kfifo_unused(fifo)) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); - -void __kfifo_dma_in_finish_r(struct __kfifo *fifo, - unsigned int len, size_t recsize) -{ - len = __kfifo_max_r(len, recsize); - __kfifo_poke_n(fifo, len, recsize); - fifo->in += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_in_finish_r); - -unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, - struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) -{ - if (!nents) - BUG(); - - len = __kfifo_max_r(len, recsize); - - if (len + recsize > fifo->in - fifo->out) - return 0; - - return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); -} -EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); - -void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) -{ - unsigned int len; - - len = __kfifo_peek_n(fifo, recsize); - fifo->out += len + recsize; -} -EXPORT_SYMBOL(__kfifo_dma_out_finish_r); diff --git a/kernel/kmod.c b/kernel/kmod.c index 0023a87e8de6..1296e72e4161 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -38,6 +38,7 @@ #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> +#include <linux/async.h> #include <asm/uaccess.h> #include <trace/events/module.h> @@ -76,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info) static int call_modprobe(char *module_name, int wait) { + struct subprocess_info *info; static char *envp[] = { "HOME=/", "TERM=linux", @@ -97,8 +99,15 @@ static int call_modprobe(char *module_name, int wait) argv[3] = module_name; /* check free_modprobe_argv() */ argv[4] = NULL; - return call_usermodehelper_fns(modprobe_path, argv, envp, - wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); + info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, + NULL, free_modprobe_argv, NULL); + if (!info) + goto free_module_name; + + return call_usermodehelper_exec(info, wait | UMH_KILLABLE); + +free_module_name: + kfree(module_name); free_argv: kfree(argv); out: @@ -130,6 +139,14 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + /* + * We don't allow synchronous module loading from async. Module + * init may invoke async_synchronize_full() which will end up + * waiting for this task which already is waiting for the module + * loading to complete, leading to a deadlock. + */ + WARN_ON_ONCE(wait && current_is_async()); + va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); @@ -493,14 +510,28 @@ static void helper_unlock(void) * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. */ -static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask) + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) { struct subprocess_info *sub_info; sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); @@ -511,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; out: return sub_info; } - -/** - * call_usermodehelper_setfns - set a cleanup/init function - * @info: a subprocess_info returned by call_usermodehelper_setup - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -static -void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - info->cleanup = cleanup; - info->init = init; - info->data = data; -} +EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa * @wait: wait for the application to finish and return status. - * when -1 don't wait at all, but you get no useful error back when - * the program couldn't be exec'ed. This makes it safe to call + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -606,31 +614,34 @@ unlock: helper_unlock(); return retval; } +EXPORT_SYMBOL(call_usermodehelper_exec); -/* - * call_usermodehelper_fns() will not run the caller-provided cleanup function - * if a memory allocation failure is experienced. So the caller might need to - * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform - * the necessaary cleanup within the caller. +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). */ -int call_usermodehelper_fns( - char *path, char **argv, char **envp, int wait, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data) +int call_usermodehelper(char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - info = call_usermodehelper_setup(path, argv, envp, gfp_mask); - + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); if (info == NULL) return -ENOMEM; - call_usermodehelper_setfns(info, init, cleanup, data); - return call_usermodehelper_exec(info, wait); } -EXPORT_SYMBOL(call_usermodehelper_fns); +EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 098f396aa409..3fed7f0cbcdf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void) struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (p->addr == addr) return p; } @@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); -static DECLARE_COMPLETION(optimizer_comp); #define OPTIMIZE_DELAY 5 /* @@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) /* Start optimizer after OPTIMIZE_DELAY passed */ static __kprobes void kick_kprobe_optimizer(void) { - if (!delayed_work_pending(&optimizing_work)) - schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); } /* Kprobe jump optimizer */ @@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); - else - /* Wake up all waiters */ - complete_all(&optimizer_comp); } /* Wait for completing optimization and unoptimization */ static __kprobes void wait_for_kprobe_optimizer(void) { - if (delayed_work_pending(&optimizing_work)) - wait_for_completion(&optimizer_comp); + mutex_lock(&kprobe_mutex); + + while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { + mutex_unlock(&kprobe_mutex); + + /* this will also make optimizing_work execute immmediately */ + flush_delayed_work(&optimizing_work); + /* @optimizing_work might not have been queued yet, relax */ + cpu_relax(); + + mutex_lock(&kprobe_mutex); + } + + mutex_unlock(&kprobe_mutex); } /* Optimize kprobe if p is ready to be optimized */ @@ -788,53 +794,58 @@ out: } #ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already allowed, just return */ if (kprobes_allow_optimization) - return; + goto out; kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } printk(KERN_INFO "Kprobes globally optimized\n"); +out: + mutex_unlock(&kprobe_mutex); } -/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already prohibited, just return */ - if (!kprobes_allow_optimization) + if (!kprobes_allow_optimization) { + mutex_unlock(&kprobe_mutex); return; + } kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (!kprobe_disabled(p)) unoptimize_kprobe(p, false); } } + mutex_unlock(&kprobe_mutex); + /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); printk(KERN_INFO "Kprobes globally unoptimized\n"); } +static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, @@ -842,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, { int ret; - mutex_lock(&kprobe_mutex); + mutex_lock(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -850,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, optimize_all_kprobes(); else unoptimize_all_kprobes(); - mutex_unlock(&kprobe_mutex); + mutex_unlock(&kprobe_sysctl_mutex); return ret; } @@ -919,7 +930,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, @@ -964,7 +975,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); } -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) do {} while (0) #define disarm_kprobe_ftrace(p) do {} while (0) @@ -1141,7 +1152,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; + struct hlist_node *tmp; unsigned long hash, flags = 0; if (unlikely(!kprobes_initialized)) @@ -1152,12 +1163,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + hlist_for_each_entry_safe(ri, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1166,9 +1177,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + struct hlist_node *next; - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { + hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { hlist_del(&ri->hlist); kfree(ri); } @@ -1178,14 +1189,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) { unsigned long flags, hash; struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + struct hlist_node *next; struct hlist_head *head; /* No race here */ for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { kretprobe_table_lock(hash, &flags); head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, pos, next, head, hlist) { + hlist_for_each_entry_safe(ri, next, head, hlist) { if (ri->rp == rp) ri->rp = NULL; } @@ -1414,12 +1425,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { -#ifdef KPROBES_CAN_USE_FTRACE +#ifdef CONFIG_KPROBES_ON_FTRACE /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; p->flags |= KPROBE_FLAG_FTRACE; -#else /* !KPROBES_CAN_USE_FTRACE */ +#else /* !CONFIG_KPROBES_ON_FTRACE */ return -EINVAL; #endif } @@ -2021,7 +2032,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, { struct module *mod = data; struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; int checkcore = (val == MODULE_STATE_GOING); @@ -2038,7 +2048,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (within_module_init((unsigned long)p->addr, mod) || (checkcore && within_module_core((unsigned long)p->addr, mod))) { @@ -2185,7 +2195,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; @@ -2194,7 +2203,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) head = &kprobe_table[i]; preempt_disable(); - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (kprobe_aggrprobe(p)) { @@ -2229,7 +2238,6 @@ static const struct file_operations debugfs_kprobes_operations = { static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -2242,7 +2250,7 @@ static void __kprobes arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) + hlist_for_each_entry_rcu(p, head, hlist) if (!kprobe_disabled(p)) arm_kprobe(p); } @@ -2258,7 +2266,6 @@ already_enabled: static void __kprobes disarm_all_kprobes(void) { struct hlist_head *head; - struct hlist_node *node; struct kprobe *p; unsigned int i; @@ -2275,7 +2282,7 @@ static void __kprobes disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, node, head, hlist) { + hlist_for_each_entry_rcu(p, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) disarm_kprobe(p, false); } diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..760e86df8c20 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> +#include <linux/uaccess.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -52,8 +53,21 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ + return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ + struct completion *vfork = ACCESS_ONCE(k->vfork_done); + if (likely(vfork)) + return __to_kthread(vfork); + return NULL; +} /** * kthread_should_stop - should this kthread return now? @@ -122,14 +136,32 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +/** + * probe_kthread_data - speculative version of kthread_data() + * @task: possible kthread task in question + * + * @task could be a kthread task. Return the data value specified when it + * was created if accessible. If @task isn't a kthread task or its data is + * inaccessible for any reason, %NULL is returned. This function requires + * that @task itself is safe to dereference. + */ +void *probe_kthread_data(struct task_struct *task) +{ + struct kthread *kthread = to_kthread(task); + void *data = NULL; + + probe_kernel_read(&data, &kthread->data, sizeof(data)); + return data; +} + static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) complete(&self->parked); schedule(); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); } clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); @@ -256,11 +288,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, state)) { + WARN_ON(1); + return; + } /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** @@ -274,12 +311,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - __kthread_bind(p, cpu); + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); @@ -311,17 +343,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static struct kthread *task_get_live_kthread(struct task_struct *k) +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) { - struct kthread *kthread; - - get_task_struct(k); - kthread = to_kthread(k); - /* It might have exited */ - barrier(); - if (k->vfork_done != NULL) - return kthread; - return NULL; + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + wake_up_state(k, TASK_PARKED); + } } /** @@ -334,23 +369,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) */ void kthread_unpark(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); - if (kthread) { - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. - */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu); - wake_up_process(k); - } - } - put_task_struct(k); + if (kthread) + __kthread_unpark(k, kthread); } /** @@ -367,7 +389,7 @@ void kthread_unpark(struct task_struct *k) */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); int ret = -ENOSYS; if (kthread) { @@ -380,7 +402,6 @@ int kthread_park(struct task_struct *k) } ret = 0; } - put_task_struct(k); return ret; } @@ -401,21 +422,23 @@ int kthread_park(struct task_struct *k) */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread; int ret; trace_sched_kthread_stop(k); + + get_task_struct(k); + kthread = to_live_kthread(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); } ret = k->exit_code; - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); + trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7981e5b2350d..1f3186b37fd5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -380,6 +380,13 @@ static int verbose(struct lock_class *class) unsigned long nr_stack_trace_entries; static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static void print_lockdep_off(const char *bug_msg) +{ + printk(KERN_DEBUG "%s\n", bug_msg); + printk(KERN_DEBUG "turning off the locking correctness validator.\n"); + printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} + static int save_trace(struct stack_trace *trace) { trace->nr_entries = 0; @@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace) if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); return 0; @@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) } raw_local_irq_restore(flags); - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); return NULL; } @@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); return NULL; } @@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr, *hlock_next; + struct held_lock *hlock_curr; int i, j; /* @@ -2048,8 +2052,7 @@ cache_hit: if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); return 0; } @@ -2057,12 +2060,10 @@ cache_hit: chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; /* Find the first held_lock of current chain */ - hlock_next = hlock; for (i = curr->lockdep_depth - 1; i >= 0; i--) { hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock_next->irq_context) + if (hlock_curr->irq_context != hlock->irq_context) break; - hlock_next = hlock; } i++; chain->depth = curr->lockdep_depth + 1 - i; @@ -2997,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static int print_lock_nested_lock_not_held(struct task_struct *curr, @@ -3190,9 +3192,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); + printk(KERN_DEBUG "depth: %i max: %lu!\n", + curr->lockdep_depth, MAX_LOCK_DEPTH); + + lockdep_print_held_locks(current); + debug_show_all_locks(); dump_stack(); + return 0; } @@ -3203,7 +3210,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } static int -print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, unsigned long ip) { if (!debug_locks_off()) @@ -3246,7 +3253,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 0; if (curr->lockdep_depth <= 0) - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); return 1; } @@ -3317,7 +3324,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: lockdep_init_map(lock, name, key, 0); @@ -3384,7 +3391,7 @@ lock_release_non_nested(struct task_struct *curr, goto found_it; prev_hlock = hlock; } - return print_unlock_inbalance_bug(curr, lock, ip); + return print_unlock_imbalance_bug(curr, lock, ip); found_it: if (hlock->instance == lock) diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 246b4c6e6135..4a9a86d12c8b 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S @@ -1,15 +1,8 @@ -/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ -#ifndef SYMBOL_PREFIX -#define ASM_SYMBOL(sym) sym -#else -#define PASTE2(x,y) x##y -#define PASTE(x,y) PASTE2(x,y) -#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) -#endif +#include <linux/export.h> #define GLOBAL(name) \ - .globl ASM_SYMBOL(name); \ - ASM_SYMBOL(name): + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): .section ".init.data","aw" diff --git a/kernel/module.c b/kernel/module.c index eab08274ec9b..b049939177f6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -197,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod) return -ENOENT; } -static inline void add_taint_module(struct module *mod, unsigned flag) +static inline void add_taint_module(struct module *mod, unsigned flag, + enum lockdep_ok lockdep_ok) { - add_taint(flag); + add_taint(flag, lockdep_ok); mod->taints |= (1U << flag); } @@ -727,7 +728,7 @@ static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_RMMOD); + add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE); return ret; } #else @@ -1138,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason) if (!test_taint(TAINT_FORCED_MODULE)) printk(KERN_WARNING "%s: %s: kernel tainted.\n", mod->name, reason); - add_taint_module(mod, TAINT_FORCED_MODULE); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else return -ENOEXEC; @@ -1208,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, /* Since this should be found in kernel (which can't be removed), * no locking is necessary. */ - if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc, + return check_version(sechdrs, versindex, + VMLINUX_SYMBOL_STR(module_layout), mod, crc, NULL); } @@ -1860,12 +1862,12 @@ static void free_module(struct module *mod) { trace_module_free(mod); - /* Delete from various lists */ - mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); - mutex_unlock(&module_mutex); mod_sysfs_teardown(mod); + /* We leave it in list to prevent duplicate loads, but make sure + * that noone uses it while it's being deconstructed. */ + mod->state = MODULE_STATE_UNFORMED; + /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1878,6 +1880,11 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); + /* Now we can delete it from the lists */ + mutex_lock(&module_mutex); + stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); + /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); @@ -2147,7 +2154,8 @@ static void set_license(struct module *mod, const char *license) if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); } } @@ -2539,7 +2547,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) if (err) goto out; - err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); + err = vfs_getattr(&file->f_path, &stat); if (err) goto out; @@ -2700,10 +2708,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) } if (!get_modinfo(info, "intree")) - add_taint_module(mod, TAINT_OOT_MODULE); + add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); if (get_modinfo(info, "staging")) { - add_taint_module(mod, TAINT_CRAP); + add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); @@ -2869,15 +2877,17 @@ static int check_module_license_and_versions(struct module *mod) * using GPL-only symbols it needs. */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint(TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); /* lve claims to be GPL but upstream won't provide source */ if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) @@ -3141,12 +3151,72 @@ static int may_init_module(void) return 0; } +/* + * We try to place it in the list now to make sure it's unique before + * we dedicate too many resources. In particular, temporary percpu + * memory exhaustion. + */ +static int add_unformed_module(struct module *mod) +{ + int err; + struct module *old; + + mod->state = MODULE_STATE_UNFORMED; + +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto out_unlocked; + goto again; + } + err = -EEXIST; + goto out; + } + list_add_rcu(&mod->list, &modules); + err = 0; + +out: + mutex_unlock(&module_mutex); +out_unlocked: + return err; +} + +static int complete_formation(struct module *mod, struct load_info *info) +{ + int err; + + mutex_lock(&module_mutex); + + /* Find duplicate symbols (must be called under lock). */ + err = verify_export_symbols(mod); + if (err < 0) + goto out; + + /* This relies on module_mutex for list integrity. */ + module_bug_finalize(info->hdr, info->sechdrs, mod); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + +out: + mutex_unlock(&module_mutex); + return err; +} + /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ static int load_module(struct load_info *info, const char __user *uargs, int flags) { - struct module *mod, *old; + struct module *mod; long err; err = module_sig_check(info); @@ -3164,36 +3234,20 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } - /* - * We try to place it in the list now to make sure it's unique - * before we dedicate too many resources. In particular, - * temporary percpu memory exhaustion. - */ - mod->state = MODULE_STATE_UNFORMED; -again: - mutex_lock(&module_mutex); - if ((old = find_module_all(mod->name, true)) != NULL) { - if (old->state == MODULE_STATE_COMING - || old->state == MODULE_STATE_UNFORMED) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_module; - goto again; - } - err = -EEXIST; - mutex_unlock(&module_mutex); + /* Reserve our place in the list. */ + err = add_unformed_module(mod); + if (err) goto free_module; - } - list_add_rcu(&mod->list, &modules); - mutex_unlock(&module_mutex); #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; - if (!mod->sig_ok) - add_taint_module(mod, TAINT_FORCED_MODULE); + if (!mod->sig_ok) { + printk_once(KERN_NOTICE + "%s: module verification failed: signature and/or" + " required key missing - tainting kernel\n", + mod->name); + add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + } #endif /* Now module is in final location, initialize linked lists, etc. */ @@ -3236,21 +3290,11 @@ again: dynamic_debug_setup(info->debug, info->num_debug); - mutex_lock(&module_mutex); - /* Find duplicate symbols (must be called under lock). */ - err = verify_export_symbols(mod); - if (err < 0) + /* Finally it's fully formed, ready to start executing. */ + err = complete_formation(mod, info); + if (err) goto ddebug_cleanup; - /* This relies on module_mutex for list integrity. */ - module_bug_finalize(info->hdr, info->sechdrs, mod); - - /* Mark state as coming so strong_try_module_get() ignores us, - * but kallsyms etc. can see us. */ - mod->state = MODULE_STATE_COMING; - - mutex_unlock(&module_mutex); - /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); @@ -3274,8 +3318,8 @@ again: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); - ddebug_cleanup: mutex_unlock(&module_mutex); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); diff --git a/kernel/mutex.c b/kernel/mutex.c index a307cc9c9526..ad53a664f113 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -19,6 +19,7 @@ */ #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -36,6 +37,12 @@ # include <asm/mutex.h> #endif +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. + */ +#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -43,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + lock->spin_mlock = NULL; +#endif debug_mutex_init(lock, name, key); } @@ -94,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +struct mspin_node { + struct mspin_node *next ; + int locked; /* 1 if lock acquired */ +}; +#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + smp_wmb(); + /* Wait until the lock holder passes the lock down */ + while (!ACCESS_ONCE(node->locked)) + arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + ACCESS_ONCE(next->locked) = 1; + smp_wmb(); +} + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ + int retval = 1; + + rcu_read_lock(); + if (lock->owner) + retval = lock->owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. + */ + return retval; +} +#endif + static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** @@ -157,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * * We can't do this for DEBUG_MUTEXES because that relies on wait_lock * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. */ + if (!mutex_can_spin_on_owner(lock)) + goto slowpath; for (;;) { struct task_struct *owner; + struct mspin_node node; /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ + mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) + if (owner && !mutex_spin_on_owner(lock, owner)) { + mspin_unlock(MLOCK(lock), &node); break; + } - if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + if ((atomic_read(&lock->count) == 1) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); + mspin_unlock(MLOCK(lock), &node); preempt_enable(); return 0; } + mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -194,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } +slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); @@ -204,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) goto done; lock_contended(&lock->dep_map, ip); @@ -219,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * that when we release the lock, we properly wake up the * other waiters: */ - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && + (atomic_xchg(&lock->count, -1) == 1)) break; /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 78e2ecb20165..364ceab15f0c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,7 +22,7 @@ #include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -153,8 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - new_ns = create_new_namespaces(flags, tsk, - task_cred_xxx(tsk, user_ns), tsk->fs); + new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); goto out; @@ -242,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_inode *ei; + struct proc_ns *ei; struct file *file; int err; @@ -251,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = PROC_I(file->f_dentry->d_inode); + ei = get_proc_ns(file_inode(file)); ops = ei->ns_ops; if (nstype && (ops->type != nstype)) goto out; diff --git a/kernel/panic.c b/kernel/panic.c index e1b2822fff97..167ec097ce8b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -22,7 +22,6 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> -#include <linux/dmi.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -259,26 +258,19 @@ unsigned long get_taint(void) return tainted_mask; } -void add_taint(unsigned flag) +/** + * add_taint: add a taint flag if not already set. + * @flag: one of the TAINT_* constants. + * @lockdep_ok: whether lock debugging is still OK. + * + * If something bad has gone wrong, you'll want @lockdebug_ok = false, but for + * some notewortht-but-not-corrupting cases, it can be set to true. + */ +void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { - /* - * Can't trust the integrity of the kernel anymore. - * We don't call directly debug_locks_off() because the issue - * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging/out-of-tree - * development and post-warning case. - */ - switch (flag) { - case TAINT_CRAP: - case TAINT_OOT_MODULE: - case TAINT_WARN: - case TAINT_FIRMWARE_WORKAROUND: - break; - - default: - if (__debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); - } + if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) + printk(KERN_WARNING + "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -407,13 +399,8 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - const char *board; - printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (board) - printk(KERN_WARNING "Hardware name: %s\n", board); if (args) vprintk(args->fmt, args->args); @@ -421,7 +408,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller, print_modules(); dump_stack(); print_oops_end_marker(); - add_taint(taint); + /* Just a warning, don't kill lockdep. */ + add_taint(taint, LOCKDEP_STILL_OK); } void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) diff --git a/kernel/params.c b/kernel/params.c index ed35345be536..53b958fcd639 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), GFP_KERNEL); if (!new) { - kfree(mk->mp); + kfree(attrs); err = -ENOMEM; goto fail; } + /* Despite looking like the typical realloc() bug, this is safe. + * We *want* the old 'attrs' to be freed either way, and we'll store + * the new one in the success case. */ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); if (!attrs) { err = -ENOMEM; diff --git a/kernel/pid.c b/kernel/pid.c index f2c6a6825098..0db3e791a06d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> +#include <linux/proc_ns.h> #include <linux/proc_fs.h> #define pid_hashfn(nr, ns) \ @@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - static inline int mk_pid(struct pid_namespace *pid_ns, struct pidmap *map, int off) { @@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) break; } if (likely(atomic_read(&map->nr_free))) { - do { + for ( ; ; ) { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); + if (offset >= BITS_PER_PAGE) + break; pid = mk_pid(pid_ns, map, offset); - } while (offset < BITS_PER_PAGE && pid < pid_max); + if (pid >= pid_max) + break; + } } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; @@ -350,10 +352,9 @@ void disable_pid_allocation(struct pid_namespace *ns) struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct hlist_node *elem; struct upid *pnr; - hlist_for_each_entry_rcu(pnr, elem, + hlist_for_each_entry_rcu(pnr, &pid_hash[pid_hashfn(nr, ns)], pid_chain) if (pnr->nr == nr && pnr->ns == ns) return container_of(pnr, struct pid, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c6023..6917e8edb48e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,12 +15,10 @@ #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> -#define BITS_PER_PAGE (PAGE_SIZE*8) - struct pid_cache { int nr_ids; char name[16]; @@ -181,6 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int nr; int rc; struct task_struct *task, *me = current; + int init_pids = thread_group_leader(me) ? 1 : 2; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -230,7 +229,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + if (pid_ns->nr_hashed == init_pids) break; schedule(); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index a278cad1d5d6..42670e9b44e0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -10,6 +10,8 @@ #include <linux/kernel_stat.h> #include <trace/events/timer.h> #include <linux/random.h> +#include <linux/tick.h> +#include <linux/workqueue.h> /* * Called after updating RLIMIT_CPU to run cpu timer and update @@ -153,13 +155,36 @@ static void bump_cpu_timer(struct k_itimer *timer, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + static inline cputime_t prof_ticks(struct task_struct *p) { - return p->utime + p->stime; + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + + return utime + stime; } static inline cputime_t virt_ticks(struct task_struct *p) { - return p->utime; + cputime_t utime; + + task_cputime(p, &utime, NULL); + + return utime; } static int @@ -471,18 +496,23 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { + cputime_t utime, stime; + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->cpu_timers, - tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + utime, stime, tsk->se.sum_exec_runtime); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct signal_struct *const sig = tsk->signal; + cputime_t utime, stime; + task_cputime(tsk, &utime, &stime); cleanup_timers(tsk->signal->cpu_timers, - tsk->utime + sig->utime, tsk->stime + sig->stime, + utime + sig->utime, stime + sig->stime, tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -623,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ + tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled. + */ +static void posix_cpu_timer_kick_nohz(void) +{ + schedule_work(&nohz_kick_work); +} + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ + if (!task_cputime_zero(&tsk->cputime_expires)) + return false; + + if (tsk->signal->cputimer.running) + return false; + + return true; +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -781,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } + if (!ret) + posix_cpu_timer_kick_nohz(); return ret; } @@ -995,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1226,11 +1274,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample, static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { - .utime = tsk->utime, - .stime = tsk->stime, + .utime = utime, + .stime = stime, .sum_exec_runtime = tsk->se.sum_exec_runtime }; @@ -1320,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + + /* + * In case some timers were rescheduled after the queue got emptied, + * wake up full dynticks CPUs. + */ + if (tsk->signal->cputimer.running) + posix_cpu_timer_kick_nohz(); } /* @@ -1350,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - return; + goto out; *newval += now.cpu; } @@ -1368,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } +out: + posix_cpu_timer_kick_nohz(); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, @@ -1401,8 +1461,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, while (!signal_pending(current)) { if (timer.it.cpu.expires.sched == 0) { /* - * Our timer fired and was reset. + * Our timer fired and was reset, below + * deletion can not fail. */ + posix_cpu_timer_del(&timer); spin_unlock_irq(&timer.it_lock); return 0; } @@ -1420,9 +1482,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); - posix_cpu_timer_set(&timer, 0, &zero_it, it); + error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + if (!error) { + /* + * Timer is now unarmed, deletion can not fail. + */ + posix_cpu_timer_del(&timer); + } spin_unlock_irq(&timer.it_lock); + while (error == TIMER_RETRY) { + /* + * We need to handle case when timer was or is in the + * middle of firing. In other cases we already freed + * resources. + */ + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + } + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { /* * It actually did fire already. diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 69185ae6b701..424c2d4265c9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -40,38 +40,31 @@ #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/idr.h> +#include <linux/hash.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/export.h> +#include <linux/hashtable.h> /* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id <id> - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to <ptr> - * void idr_remove(struct idr *idp, int id); to release <id> - * void idr_init(struct idr *idp); to initialize <idp> - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. */ /* * Lets keep our timers in a slab cache :-) */ static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); __timr; \ }) +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); @@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +309,16 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,11 +336,11 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); - idr_init(&posix_timers_id); return 0; } @@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -552,22 +610,9 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - retry: - if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { - error = -EAGAIN; - goto out; - } - spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); - spin_unlock_irq(&idr_lock); - if (error) { - if (error == -EAGAIN) - goto retry; - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - error = -EAGAIN; + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; goto out; } @@ -639,8 +684,15 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); + timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { @@ -997,7 +1049,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, err = kc->clock_adj(which_clock, &ktx); - if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; return err; diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index ca304046d9e2..c6422ffeda9a 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend); void queue_up_suspend_work(void) { - if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + if (autosleep_state > PM_SUSPEND_ON) queue_work(autosleep_wq, &suspend_work); } diff --git a/kernel/power/console.c b/kernel/power/console.c index b1dc456474b5..463aa6736751 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -4,6 +4,7 @@ * Originally from swsusp. */ +#include <linux/console.h> #include <linux/vt_kern.h> #include <linux/kbd_kern.h> #include <linux/vt.h> @@ -14,8 +15,120 @@ static int orig_fgconsole, orig_kmsg; +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. + */ +void pm_vt_switch_required(struct device *dev, bool required) +{ + struct pm_vt_switch *entry, *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. + */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + break; + } + } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); + +/* + * There are three cases when a VT switch on suspend/resume are required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). + */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; + + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; + + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; + } + + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + int pm_prepare_console(void) { + if (!pm_vt_switch()) + return 0; + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) return 1; @@ -26,6 +139,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { + if (!pm_vt_switch()) + return; + if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c16f9167de1..d77663bfedeb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND - suspend_state_t state = PM_SUSPEND_STANDBY; + suspend_state_t state = PM_SUSPEND_MIN; const char * const *s; #endif char *p; @@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ +#ifdef CONFIG_FREEZER +static ssize_t pm_freeze_timeout_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", freeze_timeout_msecs); +} + +static ssize_t pm_freeze_timeout_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + freeze_timeout_msecs = val; + return n; +} + +power_attr(pm_freeze_timeout); + +#endif /* CONFIG_FREEZER*/ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -576,6 +600,9 @@ static struct attribute * g[] = { &pm_print_times_attr.attr, #endif #endif +#ifdef CONFIG_FREEZER + &pm_freeze_timeout_attr.attr, +#endif NULL, }; diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 68197a4e8fc9..7ef6866b521d 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -32,7 +32,7 @@ static void handle_poweroff(int key) static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, - .help_msg = "powerOff", + .help_msg = "poweroff(o)", .action_msg = "Power Off", .enable_mask = SYSRQ_ENABLE_BOOT, }; diff --git a/kernel/power/process.c b/kernel/power/process.c index d5a258b60c6f..98088e0e71e8 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -21,7 +21,7 @@ /* * Timeout for stopping processes */ -#define TIMEOUT (20 * HZ) +unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { @@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only) do_gettimeofday(&start); - end_time = jiffies + TIMEOUT; + end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9322ff7eaad6..587dddeebf15 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value, "%s called for unknown object.", __func__)) return; - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); if (new_value != req->node.prio) pm_qos_update_target( @@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req) return; } - if (delayed_work_pending(&req->work)) - cancel_delayed_work_sync(&req->work); + cancel_delayed_work_sync(&req->work); pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, &req->node, PM_QOS_REMOVE_REQ, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..bef86d121eb2 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,12 +30,38 @@ #include "power.h" const char *const pm_states[PM_SUSPEND_MAX] = { + [PM_SUSPEND_FREEZE] = "freeze", [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", }; static const struct platform_suspend_ops *suspend_ops; +static bool need_suspend_ops(suspend_state_t state) +{ + return !!(state > PM_SUSPEND_FREEZE); +} + +static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static bool suspend_freeze_wake; + +static void freeze_begin(void) +{ + suspend_freeze_wake = false; +} + +static void freeze_enter(void) +{ + wait_event(suspend_freeze_wait_head, suspend_freeze_wake); +} + +void freeze_wake(void) +{ + suspend_freeze_wake = true; + wake_up(&suspend_freeze_wait_head); +} +EXPORT_SYMBOL_GPL(freeze_wake); + /** * suspend_set_ops - Set the global suspend method table. * @ops: Suspend operations to use. @@ -50,8 +76,23 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && + pm_test_level != TEST_FREEZER && + pm_test_level != TEST_DEVICES && + pm_test_level != TEST_PLATFORM) { + printk(KERN_WARNING "Unsupported pm_test mode for " + "freeze state, please choose " + "none/freezer/devices/platform.\n"); + return false; + } +#endif + return true; + } /* - * All states need lowlevel support and need to be valid to the lowlevel + * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel + * support and need to be valid to the lowlevel * implementation, no valid callback implies that none are valid. */ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); @@ -89,11 +130,11 @@ static int suspend_test(int level) * hibernation). Run suspend notifiers, allocate the "suspend" console and * freeze processes. */ -static int suspend_prepare(void) +static int suspend_prepare(suspend_state_t state) { int error; - if (!suspend_ops || !suspend_ops->enter) + if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) return -EPERM; pm_prepare_console(); @@ -137,7 +178,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) { int error; - if (suspend_ops->prepare) { + if (need_suspend_ops(state) && suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) goto Platform_finish; @@ -149,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - if (suspend_ops->prepare_late) { + if (need_suspend_ops(state) && suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) goto Platform_wake; @@ -158,6 +199,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; + /* + * PM_SUSPEND_FREEZE equals + * frozen processes + suspended devices + idle processors. + * Thus we should invoke freeze_enter() soon after + * all the devices are suspended. + */ + if (state == PM_SUSPEND_FREEZE) { + freeze_enter(); + goto Platform_wake; + } + error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -182,13 +234,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - if (suspend_ops->wake) + if (need_suspend_ops(state) && suspend_ops->wake) suspend_ops->wake(); dpm_resume_start(PMSG_RESUME); Platform_finish: - if (suspend_ops->finish) + if (need_suspend_ops(state) && suspend_ops->finish) suspend_ops->finish(); return error; @@ -203,11 +255,11 @@ int suspend_devices_and_enter(suspend_state_t state) int error; bool wakeup = false; - if (!suspend_ops) + if (need_suspend_ops(state) && !suspend_ops) return -ENOSYS; trace_machine_suspend(state); - if (suspend_ops->begin) { + if (need_suspend_ops(state) && suspend_ops->begin) { error = suspend_ops->begin(state); if (error) goto Close; @@ -226,7 +278,7 @@ int suspend_devices_and_enter(suspend_state_t state) do { error = suspend_enter(state, &wakeup); - } while (!error && !wakeup + } while (!error && !wakeup && need_suspend_ops(state) && suspend_ops->suspend_again && suspend_ops->suspend_again()); Resume_devices: @@ -236,13 +288,13 @@ int suspend_devices_and_enter(suspend_state_t state) ftrace_start(); resume_console(); Close: - if (suspend_ops->end) + if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: - if (suspend_ops->recover) + if (need_suspend_ops(state) && suspend_ops->recover) suspend_ops->recover(); goto Resume_devices; } @@ -278,12 +330,15 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; + if (state == PM_SUSPEND_FREEZE) + freeze_begin(); + printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - error = suspend_prepare(); + error = suspend_prepare(state); if (error) goto Unlock; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 25596e450ac7..9b2a1d58558d 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) rtc_set_alarm(rtc, &alm); } -static int __init has_wakealarm(struct device *dev, void *name_ptr) +static int __init has_wakealarm(struct device *dev, const void *data) { struct rtc_device *candidate = to_rtc_device(dev); @@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(const char **)name_ptr = dev_name(dev); return 1; } @@ -159,8 +158,8 @@ static int __init test_suspend(void) static char warn_no_rtc[] __initdata = KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; - char *pony = NULL; struct rtc_device *rtc = NULL; + struct device *dev; /* PM is initialized by now; is that state testable? */ if (test_state == PM_SUSPEND_ON) @@ -171,9 +170,9 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? */ - class_find_device(rtc_class, NULL, &pony, has_wakealarm); - if (pony) - rtc = rtc_class_open(pony); + dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + if (dev) + rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); goto done; diff --git a/kernel/printk.c b/kernel/printk.c index 267ce780abe8..fa36e1494420 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> +#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -42,19 +43,14 @@ #include <linux/notifier.h> #include <linux/rculist.h> #include <linux/poll.h> +#include <linux/irq_work.h> +#include <linux/utsname.h> #include <asm/uaccess.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -62,8 +58,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -87,6 +81,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_lock_dep_map = { + .name = "console_lock" +}; +#endif + /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -217,6 +217,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -602,7 +603,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; - ret = POLLIN|POLLRDNORM; + else + ret = POLLIN|POLLRDNORM; } raw_spin_unlock_irq(&logbuf_lock); @@ -1259,7 +1261,7 @@ static void call_console_drivers(int level, const char *text, size_t len) { struct console *con; - trace_console(text, 0, len, len); + trace_console(text, len); if (level >= console_loglevel && !ignore_loglevel) return; @@ -1717,6 +1719,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) { @@ -1918,6 +1943,7 @@ void console_lock(void) return; console_locked = 1; console_may_schedule = 1; + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -1939,6 +1965,7 @@ int console_trylock(void) } console_locked = 1; console_may_schedule = 0; + mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -1948,43 +1975,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -void printk_tick(void) -{ - if (__this_cpu_read(printk_pending)) { - int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); - } -} - -int printk_needs_cpu(int cpu) -{ - if (cpu_is_offline(cpu)) - printk_tick(); - return __this_cpu_read(printk_pending); -} - -void wake_up_klogd(void) -{ - if (waitqueue_active(&log_wait)) - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2099,6 +2089,7 @@ skip: local_irq_restore(flags); } console_locked = 0; + mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) @@ -2446,6 +2437,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { @@ -2462,6 +2491,7 @@ int printk_sched(const char *fmt, ...) va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); local_irq_restore(flags); return r; @@ -2821,4 +2851,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +static char dump_stack_arch_desc_str[128]; + +/** + * dump_stack_set_arch_desc - set arch-specific str to show with task dumps + * @fmt: printf-style format string + * @...: arguments for the format string + * + * The configured string will be printed right after utsname during task + * dumps. Usually used to add arch-specific system identifiers. If an + * arch wants to make use of such an ID string, it should initialize this + * as soon as possible during boot. + */ +void __init dump_stack_set_arch_desc(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), + fmt, args); + va_end(args); +} + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack(). + */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + if (dump_stack_arch_desc_str[0] != '\0') + printk("%sHardware name: %s\n", + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); +} + +/** + * show_regs_print_info - print generic debug info for show_regs() + * @log_lvl: log level + * + * show_regs() implementations can use this function to print out generic + * debug information. + */ +void show_regs_print_info(const char *log_lvl) +{ + dump_stack_print_info(log_lvl); + + printk("%stask: %p ti: %p task.ti: %p\n", + log_lvl, current, current_thread_info(), + task_thread_info(current)); +} + #endif diff --git a/kernel/profile.c b/kernel/profile.c index 1f391819c42f..0bf400737660 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -37,9 +37,6 @@ struct profile_hit { #define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) -/* Oprofile timer tick hook */ -static int (*timer_hook)(struct pt_regs *) __read_mostly; - static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; @@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -int register_timer_hook(int (*hook)(struct pt_regs *)) -{ - if (timer_hook) - return -EBUSY; - timer_hook = hook; - return 0; -} -EXPORT_SYMBOL_GPL(register_timer_hook); - -void unregister_timer_hook(int (*hook)(struct pt_regs *)) -{ - WARN_ON(hook != timer_hook); - timer_hook = NULL; - /* make sure all CPUs see the NULL hook */ - synchronize_sched(); /* Allow ongoing interrupts to complete. */ -} -EXPORT_SYMBOL_GPL(unregister_timer_hook); - - #ifdef CONFIG_SMP /* * Each cpu has a pair of open-addressed hashtables for pending @@ -436,8 +414,6 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (type == CPU_PROFILING && timer_hook) - timer_hook(regs); if (!user_mode(regs) && prof_cpu_mask != NULL && cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) profile_hit(type, (void *)profile_pc(regs)); @@ -486,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = { .write = prof_cpu_mask_proc_write, }; -void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) +void create_prof_cpu_mask(void) { /* create /proc/irq/prof_cpu_mask */ - proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); + proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops); } /* @@ -624,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ NULL, &proc_profile_operations); if (!entry) return 0; - entry->size = (1+prof_len) * sizeof(atomic_t); + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); hotcpu_notifier(profile_cpu_callback, 0); return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6cbeaae4406d..aed981a3f69c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -17,6 +17,7 @@ #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/uio.h> #include <linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> @@ -24,6 +25,7 @@ #include <linux/regset.h> #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> +#include <linux/compat.h> static int ptrace_trapping_sleep_fn(void *flags) @@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) return error; } +static int ptrace_peek_siginfo(struct task_struct *child, + unsigned long addr, + unsigned long data) +{ + struct ptrace_peeksiginfo_args arg; + struct sigpending *pending; + struct sigqueue *q; + int ret, i; + + ret = copy_from_user(&arg, (void __user *) addr, + sizeof(struct ptrace_peeksiginfo_args)); + if (ret) + return -EFAULT; + + if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED) + return -EINVAL; /* unknown flags */ + + if (arg.nr < 0) + return -EINVAL; + + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) + pending = &child->signal->shared_pending; + else + pending = &child->pending; + + for (i = 0; i < arg.nr; ) { + siginfo_t info; + s32 off = arg.off + i; + + spin_lock_irq(&child->sighand->siglock); + list_for_each_entry(q, &pending->list, list) { + if (!off--) { + copy_siginfo(&info, &q->info); + break; + } + } + spin_unlock_irq(&child->sighand->siglock); + + if (off >= 0) /* beyond the end of the list */ + break; + +#ifdef CONFIG_COMPAT + if (unlikely(is_compat_task())) { + compat_siginfo_t __user *uinfo = compat_ptr(data); + + ret = copy_siginfo_to_user32(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } else +#endif + { + siginfo_t __user *uinfo = (siginfo_t __user *) data; + + ret = copy_siginfo_to_user(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } + + if (ret) { + ret = -EFAULT; + break; + } + + data += sizeof(siginfo_t); + i++; + + if (signal_pending(current)) + break; + + cond_resched(); + } + + if (i > 0) + return i; + + return ret; +} #ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) @@ -712,6 +789,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, kiov->iov_len, kiov->iov_base); } +/* + * This is declared in linux/regset.h and defined in machine-dependent + * code. We put the export here, near the primary machine-neutral use, + * to ensure no machine forgets it. + */ +EXPORT_SYMBOL_GPL(task_user_regset_view); #endif int ptrace_request(struct task_struct *child, long request, @@ -742,6 +825,10 @@ int ptrace_request(struct task_struct *child, long request, ret = put_user(child->ptrace_message, datalp); break; + case PTRACE_PEEKSIGINFO: + ret = ptrace_peek_siginfo(child, addr, data); + break; + case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) diff --git a/kernel/range.c b/kernel/range.c index 9b8ae2d6ed68..071b0ab455cb 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) range[i].end = range[j].end; range[i].start = end; } else { - printk(KERN_ERR "run of slot in ranges\n"); + pr_err("%s: run out of slot in ranges\n", + __func__); } range[j].end = start; continue; diff --git a/kernel/rcu.h b/kernel/rcu.h index 20dfba576c2b..7f8e7590e3e5 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) extern int rcu_expedited; +#ifdef CONFIG_RCU_STALL_COMMON + +extern int rcu_cpu_stall_suppress; +int rcu_jiffies_till_stall_check(void); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a2cf76177b44..48ab70384a4c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) +void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, unsigned long c) { - trace_rcu_torture_read(rcutorturename, rhp); + trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); } EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #else -#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) #endif + +#ifdef CONFIG_RCU_STALL_COMMON + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + +module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index e7dce58f9c2a..a0714a51b6d7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); -#include "rcutiny_plugin.h" - static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; +#include "rcutiny_plugin.h" + /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long newval) { @@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * interrupts don't count, we must be running at the first interrupt * level. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return rcu_dynticks_nesting <= 1; } @@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void) */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { + reset_cpu_stall_ticks(rcp); if (rcp->rcucblist != NULL && rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; @@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu) */ void rcu_check_callbacks(int cpu, int user) { + check_cpu_stalls(); if (user || rcu_is_cpu_rrupt_from_idle()) rcu_sched_qs(cpu); else if (!in_softirq()) diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f85016a2309b..8a233002faeb 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -33,6 +33,9 @@ struct rcu_ctrlblk { struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ + RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ + RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ RCU_TRACE(char *name); /* Name of RCU type. */ }; @@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_RCU_TRACE + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stall_preempt(void); + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ +#ifdef CONFIG_RCU_TRACE + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +#endif /* #ifdef CONFIG_RCU_TRACE */ +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); + RCU_TRACE(check_cpu_stall_preempt()); +} + #ifdef CONFIG_TINY_PREEMPT_RCU #include <linux/delay.h> @@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void) /* Official start of GP. */ rcu_preempt_ctrlblk.gpnum++; RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); + reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb); /* Any blocked RCU readers block new GP. */ if (rcu_preempt_blocked_readers_any()) @@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney"); MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); MODULE_LICENSE("GPL"); +static void check_cpu_stall_preempt(void) +{ +#ifdef CONFIG_TINY_PREEMPT_RCU + check_cpu_stall(&rcu_preempt_ctrlblk.rcb); +#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */ +} + #endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 31dea01c85fd..e1f3a8c96724 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -46,6 +46,7 @@ #include <linux/stat.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <linux/trace_clock.h> #include <asm/byteorder.h> MODULE_LICENSE("GPL"); @@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); #define rcu_can_boost() 0 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ @@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_uninterruptible(1); + schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) @@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void) return; if (atomic_xchg(&beenhere, 1) != 0) return; - do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); ftrace_dump(DUMP_ALL); } @@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused) { int idx; int completed; + int completed_end; static DEFINE_RCU_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; + unsigned long long ts; idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused) cur_ops->readunlock(idx); return; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); @@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1094,11 +1114,13 @@ static int rcu_torture_reader(void *arg) { int completed; + int completed_end; int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; struct timer_list t; + unsigned long long ts; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); @@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg) } idx = cur_ops->readlock(); completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || @@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg) schedule_timeout_interruptible(HZ); continue; } - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); cur_ops->read_delay(&rand); @@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - if (pipe_count > 1) + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); rcutorture_trace_dump(); + } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = cur_ops->completed() - completed; + completed = completed_end - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed_ptr(reader_tasks[i], shuffle_tmp_mask); } - if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed_ptr(fakewriter_tasks[i], shuffle_tmp_mask); } - if (writer_task) set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; @@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_wq = kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), GFP_KERNEL); - if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e441b77b614e..16ea67925015 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ + .abbr = sabbr, \ } struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -105,7 +106,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimized synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_sched() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -217,18 +218,14 @@ module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - -module_param(rcu_cpu_stall_suppress, int, 0644); -module_param(rcu_cpu_stall_timeout, int, 0644); - static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -305,17 +302,29 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) } /* - * Does the current CPU require a yet-as-unscheduled grace period? + * Does the current CPU require a not-yet-started grace period? + * The caller must have disabled interrupts to prevent races with + * normal callback registry. */ static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - struct rcu_head **ntp; + int i; - ntp = rdp->nxttail[RCU_DONE_TAIL + - (ACCESS_ONCE(rsp->completed) != rdp->completed)]; - return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp && - !rcu_gp_in_progress(rsp); + if (rcu_gp_in_progress(rsp)) + return 0; /* No, a grace period is already in progress. */ + if (rcu_nocb_needs_gp(rsp)) + return 1; /* Yes, a no-CBs CPU needs one. */ + if (!rdp->nxttail[RCU_NEXT_TAIL]) + return 0; /* No, this is a no-CBs (or offline) CPU. */ + if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + return 1; /* Yes, this CPU has newly registered callbacks. */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rdp->nxttail[i - 1] != rdp->nxttail[i] && + ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + rdp->nxtcompleted[i])) + return 1; /* Yes, CBs for future grace period. */ + return 0; /* No grace period needed. */ } /* @@ -336,7 +345,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, 0); + trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); @@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); * interrupt from idle, return true. The caller must have at least * disabled preemption. */ -int rcu_is_cpu_rrupt_from_idle(void) +static int rcu_is_cpu_rrupt_from_idle(void) { return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } @@ -790,31 +799,23 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->offline_fqs++; return 1; } - return 0; -} - -static int jiffies_till_stall_check(void) -{ - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. + * There is a possibility that a CPU in adaptive-ticks state + * might run in the kernel with the scheduling-clock tick disabled + * for an extended time period. Invoke rcu_kick_nohz_cpu() to + * force the CPU to restart the scheduling-clock tick in this + * CPU is in this state. */ - if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; - till_stall_check = 3; - } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; + rcu_kick_nohz_cpu(rdp->cpu); + + return 0; } static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); + rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); } /* @@ -857,7 +858,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; + rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -935,7 +936,7 @@ static void print_cpu_stall(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) rsp->jiffies_stall = jiffies + - 3 * jiffies_till_stall_check() + 3; + 3 * rcu_jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ @@ -966,12 +967,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) } } -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - /** * rcu_cpu_stall_reset - prevent further stall warnings in current grace period * @@ -989,15 +984,6 @@ void rcu_cpu_stall_reset(void) rsp->jiffies_stall = jiffies + ULONG_MAX / 2; } -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static void __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -} - /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice @@ -1064,10 +1050,266 @@ static void init_callback_list(struct rcu_data *rdp) { int i; + if (init_nocb_callback_list(rdp)) + return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - init_nocb_callback_list(rdp); +} + +/* + * Determine the value that ->completed will have at the end of the + * next subsequent grace period. This is used to tag callbacks so that + * a CPU can invoke callbacks in a timely fashion even if that CPU has + * been dyntick-idle for an extended period with callbacks under the + * influence of RCU_FAST_NO_HZ. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static unsigned long rcu_cbs_completed(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + /* + * If RCU is idle, we just wait for the next grace period. + * But we can only be sure that RCU is idle if we are looking + * at the root rcu_node structure -- otherwise, a new grace + * period might have started, but just not yet gotten around + * to initializing the current non-root rcu_node structure. + */ + if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) + return rnp->completed + 1; + + /* + * Otherwise, wait for a possible partial grace period and + * then the subsequent full grace period. + */ + return rnp->completed + 2; +} + +/* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, char *s) +{ + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks. The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. + * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ + unsigned long c; + int i; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + /* + * Pick up grace-period number for new callbacks. If this + * grace period is already marked as needed, return to the caller. + */ + c = rcu_cbs_completed(rdp->rsp, rnp); + trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + if (rnp->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + return c; + } + + /* + * If either this rcu_node structure or the root rcu_node structure + * believe that a grace period is in progress, then we must wait + * for the one following, which is in "c". Because our request + * will be noticed at the end of the current grace period, we don't + * need to explicitly start one. + */ + if (rnp->gpnum != rnp->completed || + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + rnp->need_future_gp[c & 0x1]++; + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + return c; + } + + /* + * There might be no grace period in progress. If we don't already + * hold it, acquire the root rcu_node structure's lock in order to + * start one (if needed). + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); + + /* + * Get a new grace-period number. If there really is no grace + * period in progress, it will be smaller than the one we obtained + * earlier. Adjust callbacks as needed. Note that even no-CBs + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + */ + c = rcu_cbs_completed(rdp->rsp, rnp_root); + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) + rdp->nxtcompleted[i] = c; + + /* + * If the needed for the required grace period is already + * recorded, trace and leave. + */ + if (rnp_root->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + goto unlock_out; + } + + /* Record the need for the future grace period. */ + rnp_root->need_future_gp[c & 0x1]++; + + /* If a grace period is not already in progress, start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + } else { + trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + } +unlock_out: + if (rnp != rnp_root) + raw_spin_unlock(&rnp_root->lock); + return c; +} + +/* + * Clean up any old requests for the just-ended grace period. Also return + * whether any additional grace periods have been requested. Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. + */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + rcu_nocb_gp_cleanup(rsp, rnp); + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + return needmore; +} + +/* + * If there is room, assign a ->completed number to any callbacks on + * this CPU that have not already been assigned. Also accelerate any + * callbacks that were previously assigned a ->completed number that has + * since proven to be too conservative, which can happen if callbacks get + * assigned a ->completed number while RCU is idle, but with reference to + * a non-root rcu_node structure. This function is idempotent, so it does + * not hurt to call it repeatedly. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + int i; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Starting from the sublist containing the callbacks most + * recently assigned a ->completed number and working down, find the + * first sublist that is not assignable to an upcoming grace period. + * Such a sublist has something in it (first two tests) and has + * a ->completed number assigned that will complete sooner than + * the ->completed number for newly arrived callbacks (last test). + * + * The key point is that any later sublist can be assigned the + * same ->completed number as the newly arrived callbacks, which + * means that the callbacks in any of these later sublist can be + * grouped into a single sublist, whether or not they have already + * been assigned a ->completed number. + */ + c = rcu_cbs_completed(rsp, rnp); + for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] != rdp->nxttail[i - 1] && + !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) + break; + + /* + * If there are no sublist for unassigned callbacks, leave. + * At the same time, advance "i" one sublist, so that "i" will + * index into the sublist where all the remaining callbacks should + * be grouped into. + */ + if (++i >= RCU_NEXT_TAIL) + return; + + /* + * Assign all subsequent callbacks' ->completed number to the next + * full grace period and group them all in the sublist initially + * indexed by "i". + */ + for (; i <= RCU_NEXT_TAIL; i++) { + rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtcompleted[i] = c; + } + /* Record any needed additional grace periods. */ + rcu_start_future_gp(rnp, rdp); + + /* Trace depending on how much we were able to accelerate. */ + if (!*rdp->nxttail[RCU_WAIT_TAIL]) + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + else + trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); +} + +/* + * Move any callbacks whose grace period has completed to the + * RCU_DONE_TAIL sublist, then compact the remaining sublists and + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * sublist. This function is idempotent, so it does not hurt to + * invoke it repeatedly. As long as it is not invoked -too- often... + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + int i, j; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Find all callbacks whose ->completed numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) + break; + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; + } + /* Clean up any sublist tail pointers that were misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; + + /* Copy down callbacks to fill in empty sublists. */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) + break; + rdp->nxttail[j] = rdp->nxttail[i]; + rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; + } + + /* Classify any remaining callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); } /* @@ -1080,12 +1322,15 @@ static void __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* Did another grace period end? */ - if (rdp->completed != rnp->completed) { + if (rdp->completed == rnp->completed) { + + /* No, so just accelerate recent callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + + } else { - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + /* Advance callbacks. */ + rcu_advance_cbs(rsp, rnp, rdp); /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; @@ -1195,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; WARN_ON_ONCE(rnp->completed != rsp->completed); - rnp->completed = rsp->completed; + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1206,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((random32() % (rcu_num_nodes * 8)) == 0) + if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + system_state == SYSTEM_RUNNING) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); @@ -1248,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1277,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; + ACCESS_ONCE(rnp->completed) = rsp->gpnum; + rdp = this_cpu_ptr(rsp->rda); + if (rnp == rdp->mynode) + __rcu_process_gp_end(rsp, rnp, rdp); + nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); @@ -1363,64 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg) /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. + * the root node's ->lock and hard irqs must be disabled. * * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. */ static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - if (!rsp->gp_kthread || - !cpu_needs_another_gp(rsp, rdp)) { + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period * task, this CPU does not need another grace period, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - - /* - * Because there is no grace period in progress right now, - * any callbacks we have up to this point will be satisfied - * by the next grace period. So promote all callbacks to be - * handled after the end of the next grace period. If the - * CPU is not yet aware of the end of the previous grace period, - * we need to allow for the callback advancement that will - * occur when it does become aware. Deadlock prevents us from - * making it aware at this point: We cannot acquire a leaf - * rcu_node ->lock while holding the root rcu_node ->lock. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - if (rdp->completed == rsp->completed) - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ - - /* Ensure that CPU is aware of completion of last grace period. */ - rcu_process_gp_end(rsp, rdp); - local_irq_restore(flags); /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); } /* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks + * then start the grace period! + */ + rcu_advance_cbs(rsp, rnp, rdp); + rcu_start_gp_advanced(rsp, rnp, rdp); +} + +/* * Report a full set of quiescent states to the specified rcu_state * data structure. This involves cleaning up after the prior grace * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * if one is needed. Note that the caller must hold rnp->lock, which + * is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) @@ -1527,7 +1778,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rcu_accelerate_cbs(rsp, rnp, rdp); rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } @@ -1579,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* No-CBs CPUs do not have orphanable callbacks. */ - if (is_nocb_cpu(rdp->cpu)) + if (rcu_is_nocb_cpu(rdp->cpu)) return; /* @@ -1779,7 +2030,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) long bl, count, count_lazy; int i; - /* If no callbacks are ready, just return.*/ + /* If no callbacks are ready, just return. */ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), @@ -2008,19 +2259,20 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* - * Advance callbacks in response to end of earlier grace - * period that some other CPU ended. - */ + /* Handle the end of a grace period that some other CPU ended. */ rcu_process_gp_end(rsp, rdp); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); /* Does this CPU require a not-yet-started grace period? */ + local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); - rcu_start_gp(rsp, flags); /* releases above lock */ + raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + rcu_start_gp(rsp); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + } else { + local_irq_restore(flags); } /* If there are callbacks ready, invoke them. */ @@ -2063,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void invoke_rcu_core(void) { - raise_softirq(RCU_SOFTIRQ); + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); } /* @@ -2098,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + raw_spin_lock(&rnp_root->lock); + rcu_start_gp(rsp); + raw_spin_unlock(&rnp_root->lock); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -2522,19 +2775,27 @@ static int rcu_pending(int cpu) } /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { + bool al = true; + bool hc = false; + struct rcu_data *rdp; struct rcu_state *rsp; - /* RCU callbacks either ready or pending? */ - for_each_rcu_flavor(rsp) - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) - return 1; - return 0; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->qlen != rdp->qlen_lazy) + al = false; + if (rdp->nxtlist) + hc = true; + } + if (all_lazy) + *all_lazy = al; + return hc; } /* @@ -2641,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (is_nocb_cpu(cpu)) { + if (rcu_is_nocb_cpu(cpu)) { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); @@ -2719,9 +2980,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); -#ifdef CONFIG_RCU_USER_QS - WARN_ON_ONCE(rdp->dynticks->in_user); -#endif rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); @@ -2756,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* Add CPU to rcu_node bitmasks. */ @@ -2806,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2820,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - if (nocb_cpu_expendable(cpu)) - rcu_boost_kthread_setaffinity(rnp, cpu); - else - ret = NOTIFY_BAD; + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: - /* - * The whole machine is "stopped" except this CPU, so we can - * touch any data without introducing corruption. We send the - * dying CPU's callbacks to an arbitrarily chosen online CPU. - */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -2847,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return ret; + return NOTIFY_OK; } /* @@ -2938,6 +3185,10 @@ static void __init rcu_init_one(struct rcu_state *rsp, BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + /* Silence gcc 4.8 warning about array index out of range. */ + if (rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls overflow"); + /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) @@ -2978,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); + rcu_init_one_nocb(rnp); } } @@ -3063,8 +3315,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - rcu_init_nocb(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because @@ -3074,7 +3325,6 @@ void __init rcu_init(void) cpu_notifier(rcu_cpu_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - check_cpu_stall_init(); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4b69291b093d..da77a8f57ff9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,24 +88,15 @@ struct rcu_dynticks { int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ #ifdef CONFIG_RCU_FAST_NO_HZ - int dyntick_drain; /* Prepare-for-idle state variable. */ - unsigned long dyntick_holdoff; - /* No retries for the jiffy of failure. */ - struct timer_list idle_gp_timer; - /* Wake up CPU sleeping with callbacks. */ - unsigned long idle_gp_timer_expires; - /* When to wake up CPU (for repost). */ - bool idle_first_pass; /* First pass of attempt to go idle? */ + bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + unsigned long last_accelerate; + /* Last jiffy CBs were accelerated. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -#ifdef CONFIG_RCU_USER_QS - bool ignore_user_qs; /* Treat userspace as extended QS or not */ - bool in_user; /* Is the CPU in userland from RCU POV? */ -#endif }; /* RCU's kthread states for tracing. */ @@ -138,9 +129,6 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ - atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ - /* Since this has meaning only for leaf */ - /* rcu_node structures, 32 bits suffices. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -200,6 +188,12 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_NOCB_CPU + wait_queue_head_t nocb_gp_wq[2]; + /* Place for rcu_nocb_kthread() to wait GP. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int need_future_gp[2]; + /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; @@ -282,6 +276,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; + unsigned long nxtcompleted[RCU_NEXT_SIZE]; + /* grace periods for sublists. */ long qlen_lazy; /* # of lazy queued callbacks */ long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; @@ -330,6 +326,11 @@ struct rcu_data { struct task_struct *nocb_kthread; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* 8) RCU CPU stall data. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned int softirq_snap; /* Snapshot of softirq activity. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + int cpu; struct rcu_state *rsp; }; @@ -343,11 +344,6 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ @@ -382,12 +378,6 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); -#ifdef CONFIG_RCU_NOCB_CPU - void (*call_remote)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - /* call_rcu() flavor, but for */ - /* placing on remote CPU. */ -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -450,6 +440,7 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ }; @@ -527,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); @@ -536,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static bool is_nocb_cpu(int cpu); +static int rcu_nocb_needs_gp(struct rcu_state *rsp); +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp); -static bool nocb_cpu_expendable(int cpu); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void init_nocb_callback_list(struct rcu_data *rdp); -static void __init rcu_init_nocb(void); +static void rcu_kick_nohz_cpu(int cpu); +static bool init_nocb_callback_list(struct rcu_data *rdp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..170814dc418f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,6 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> +#include <linux/tick.h> #define RCU_KTHREAD_PRIO 1 @@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void) if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU +#ifndef CONFIG_RCU_NOCB_CPU_NONE + if (!have_rcu_nocb_mask) { + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + } +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tExperimental no-CBs CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tExperimental no-CBs for all CPUs\n"); + cpumask_setall(rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { - if (cpumask_test_cpu(0, rcu_nocb_mask)) { - cpumask_clear_cpu(0, rcu_nocb_mask); - pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); - } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) @@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); + RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ + return rcu_cpu_has_callbacks(cpu, NULL); } /* @@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void) * * The following three proprocessor symbols control this state machine: * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - * to satisfy RCU. Beyond this point, it is better to incur a periodic - * scheduling-clock interrupt than to loop through the state machine - * at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - * optional if RCU does not need anything immediately from this - * CPU, even if this CPU still has RCU callbacks queued. The first - * times through the state machine are mandatory: we need to give - * the state machine a chance to communicate a quiescent state - * to the RCU core. * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This * is sized to be roughly one RCU grace period. Those energy-efficiency @@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void) * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -extern int tick_nohz_enabled; - -/* - * Does the specified flavor of RCU have non-lazy callbacks pending on - * the specified CPU? Both RCU flavor and CPU are specified by the - * rcu_data structure. - */ -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) -{ - return rdp->qlen != rdp->qlen_lazy; -} +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; +module_param(rcu_idle_gp_delay, int, 0644); +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; +module_param(rcu_idle_lazy_gp_delay, int, 0644); -#ifdef CONFIG_TREE_PREEMPT_RCU +extern int tick_nohz_enabled; /* - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there - * is no RCU-preempt in the kernel.) + * Try to advance callbacks for all flavors of RCU on the current CPU. + * Afterwards, if there are any callbacks ready for immediate invocation, + * return true. */ -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +static bool rcu_try_advance_all_cbs(void) { - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - return __rcu_cpu_has_nonlazy_callbacks(rdp); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + bool cbs_ready = false; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) -{ - return 0; -} + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if (rdp->completed != rnp->completed && + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_process_gp_end(rsp, rdp); -/* - * Does any flavor of RCU have non-lazy callbacks on the specified CPU? - */ -static bool rcu_cpu_has_nonlazy_callbacks(int cpu) -{ - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_cpu_has_nonlazy_callbacks(cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + cbs_ready = true; + } + return cbs_ready; } /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke. If the CPU has callbacks, try to advance them. Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks. * - * The delta_jiffies argument is used to store the time when RCU is - * going to need the CPU again if it still has callbacks. The reason - * for this is that rcu_prepare_for_idle() might need to post a timer, - * but if so, it will do so after tick_nohz_stop_sched_tick() has set - * the wakeup time for this CPU. This means that RCU's timer can be - * delayed until the wakeup time, which defeats the purpose of posting - * a timer. + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; + /* Snapshot to detect later posting of non-lazy callback. */ + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - *delta_jiffies = ULONG_MAX; + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + *dj = ULONG_MAX; return 0; } - if (rdtp->dyntick_holdoff == jiffies) { - /* RCU recently tried and failed, so don't try again. */ - *delta_jiffies = 1; + + /* Attempt to advance callbacks. */ + if (rcu_try_advance_all_cbs()) { + /* Some ready to invoke, so initiate later invocation. */ + invoke_rcu_core(); return 1; } - /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, - RCU_IDLE_GP_DELAY) - jiffies; + rdtp->last_accelerate = jiffies; + + /* Request timer delay depending on laziness, and round. */ + if (rdtp->all_lazy) { + *dj = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; } else { - *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } return 0; } /* - * Handler for smp_call_function_single(). The only point of this - * handler is to wake the CPU up, so the handler does only tracing. - */ -void rcu_idle_demigrate(void *unused) -{ - trace_rcu_prep_idle("Demigrate"); -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending. The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - * - * One special case: the timer gets migrated without awakening the CPU - * on which the timer was scheduled on. In this case, we must wake up - * that CPU. We do so with smp_call_function_single(). - */ -static void rcu_idle_gp_timer_func(unsigned long cpu_in) -{ - int cpu = (int)cpu_in; - - trace_rcu_prep_idle("Timer"); - if (cpu != smp_processor_id()) - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); - else - WARN_ON_ONCE(1); /* Getting here can hang the system... */ -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dyntick_holdoff = jiffies - 1; - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); - rdtp->idle_gp_timer_expires = jiffies - 1; - rdtp->idle_first_pass = 1; -} - -/* - * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to ->idle_gp_timer, so cancel it. This will - * do nothing if this timer is not active, so just cancel it unconditionally. - */ -static void rcu_cleanup_after_idle(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - del_timer(&rdtp->idle_gp_timer); - trace_rcu_prep_idle("Cleanup after idle"); - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); -} - -/* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done. This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. - * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode. In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. - * - * Because it is not legal to invoke rcu_process_callbacks() with irqs - * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The ->dyntick_drain field controls the sequencing. + * Prepare a CPU for idle from an RCU perspective. The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks. The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { - struct timer_list *tp; + struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_node *rnp; + struct rcu_state *rsp; int tne; /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_enabled); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu)) + if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; - /* Adaptive-tick mode, where usermode execution is idle to RCU. */ - if (!is_idle_task(current)) { - rdtp->dyntick_holdoff = jiffies - 1; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("User dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); - } else if (rcu_cpu_has_callbacks(cpu)) { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); - trace_rcu_prep_idle("User dyntick with lazy callbacks"); - } else { - return; - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + /* If this is a no-CBs CPU, no callbacks, just return. */ + if (rcu_is_nocb_cpu(cpu)) return; - } /* - * If this is an idle re-entry, for example, due to use of - * RCU_NONIDLE() or the new idle-loop tracing API within the idle - * loop, then don't take any state-machine actions, unless the - * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the ->idle_gp_timer if this CPU has callbacks - * pending. + * If a non-lazy callback arrived at a CPU having only lazy + * callbacks, invoke RCU core for the side-effect of recalculating + * idle duration on re-entry to idle. */ - if (!rdtp->idle_first_pass && - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { - if (rcu_cpu_has_callbacks(cpu)) { - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - } + if (rdtp->all_lazy && + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + invoke_rcu_core(); return; } - rdtp->idle_first_pass = 0; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* - * If there are no callbacks on this CPU, enter dyntick-idle mode. - * Also reset state to avoid prejudicing later attempts. + * If we have not yet accelerated this jiffy, accelerate all + * callbacks on this CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - rdtp->dyntick_holdoff = jiffies - 1; - rdtp->dyntick_drain = 0; - trace_rcu_prep_idle("No callbacks"); + if (rdtp->last_accelerate == jiffies) return; + rdtp->last_accelerate = jiffies; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!*rdp->nxttail[RCU_DONE_TAIL]) + continue; + rnp = rdp->mynode; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +} - /* - * If in holdoff mode, just return. We will presumably have - * refrained from disabling the scheduling-clock tick. - */ - if (rdtp->dyntick_holdoff == jiffies) { - trace_rcu_prep_idle("In holdoff"); - return; - } +/* + * Clean up for exit from idle. Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + struct rcu_data *rdp; + struct rcu_state *rsp; - /* Check and update the ->dyntick_drain sequencing. */ - if (rdtp->dyntick_drain <= 0) { - /* First time through, initialize the counter. */ - rdtp->dyntick_drain = RCU_IDLE_FLUSHES; - } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu) && - !local_softirq_pending()) { - /* Can we go dyntick-idle despite still having callbacks? */ - rdtp->dyntick_drain = 0; - rdtp->dyntick_holdoff = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); - } else { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); - trace_rcu_prep_idle("Dyntick with lazy callbacks"); - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - return; /* Nothing more to do immediately. */ - } else if (--(rdtp->dyntick_drain) <= 0) { - /* We have hit the limit, so time to give up. */ - rdtp->dyntick_holdoff = jiffies; - trace_rcu_prep_idle("Begin holdoff"); - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + if (rcu_is_nocb_cpu(cpu)) return; - } - - /* - * Do one step of pushing the remaining RCU callbacks through - * the RCU core state machine. - */ -#ifdef CONFIG_TREE_PREEMPT_RCU - if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state); - } -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - if (per_cpu(rcu_sched_data, cpu).nxtlist) { - rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state); - } - if (per_cpu(rcu_bh_data, cpu).nxtlist) { - rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state); - } - - /* - * If RCU callbacks are still pending, RCU still needs this CPU. - * So try forcing the callbacks through the grace period. - */ - if (rcu_cpu_has_callbacks(cpu)) { - trace_rcu_prep_idle("More callbacks"); - invoke_rcu_core(); - } else { - trace_rcu_prep_idle("Callbacks drained"); + rcu_try_advance_all_cbs(); + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); } } @@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier); static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - struct timer_list *tltp = &rdtp->idle_gp_timer; - char c; + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; - if (timer_pending(tltp)) - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, c, tltp->expires - jiffies); - else - sprintf(cp, "drain=%d %c timer not pending", - rdtp->dyntick_drain, c); + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + ulong2long(nlpd), + rdtp->all_lazy ? 'L' : '.', + rdtp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), fast_no_hz); } @@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void) static void zero_cpu_stall_ticks(struct rcu_data *rdp) { rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); } /* Increment ->ticks_this_gp for all flavors of RCU. */ @@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +/* + * Do any no-CBs CPUs need another grace period? + * + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); +} + +/* + * Set the root rcu_node structure's ->need_future_gp field + * based on the sum of those of all rcu_node structures. This does + * double-count the root rcu_node structure's requests, but this + * is necessary to handle the possibility of a rcu_nocb_kthread() + * having awakened during the time that the rcu_node structures + * were being updated for the end of the previous grace period. + */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->nocb_gp_wq[0]); + init_waitqueue_head(&rnp->nocb_gp_wq[1]); +} + /* Is the specified CPU a no-CPUs CPU? */ -static bool is_nocb_cpu(int cpu) +bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) return cpumask_test_cpu(cpu, rcu_nocb_mask); @@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy) { - if (!is_nocb_cpu(rdp->cpu)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + if (__is_kfree_rcu_offset((unsigned long)rhp->func)) + trace_rcu_kfree_callback(rdp->rsp->name, rhp, + (unsigned long)rhp->func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rdp->rsp->name, rhp, + rdp->qlen_lazy, rdp->qlen); return 1; } @@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, long qll = rsp->qlen_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ - if (!is_nocb_cpu(smp_processor_id())) + if (!rcu_is_nocb_cpu(smp_processor_id())) return 0; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, } /* - * There must be at least one non-no-CBs CPU in operation at any given - * time, because no-CBs CPUs are not capable of initiating grace periods - * independently. This function therefore complains if the specified - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to - * avoid offlining the last such CPU. (Recursion is a wonderful thing, - * but you have to have a base case!) + * If necessary, kick off a new grace period, and either way wait + * for a subsequent grace period to complete. */ -static bool nocb_cpu_expendable(int cpu) +static void rcu_nocb_wait_gp(struct rcu_data *rdp) { - cpumask_var_t non_nocb_cpus; - int ret; + unsigned long c; + bool d; + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave(&rnp->lock, flags); + c = rcu_start_future_gp(rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* - * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, - * then offlining this CPU is harmless. Let it happen. + * Wait for the grace period. Do so interruptibly to avoid messing + * up the load average. */ - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) - return 1; - - /* If no memory, play it safe and keep the CPU around. */ - if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) - return 0; - cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); - cpumask_clear_cpu(cpu, non_nocb_cpus); - ret = !cpumask_empty(non_nocb_cpus); - free_cpumask_var(non_nocb_cpus); - return ret; -} - -/* - * Helper structure for remote registry of RCU callbacks. - * This is needed for when a no-CBs CPU needs to start a grace period. - * If it just invokes call_rcu(), the resulting callback will be queued, - * which can result in deadlock. - */ -struct rcu_head_remote { - struct rcu_head *rhp; - call_rcu_func_t *crf; - void (*func)(struct rcu_head *rhp); -}; - -/* - * Register a callback as specified by the rcu_head_remote struct. - * This function is intended to be invoked via smp_call_function_single(). - */ -static void call_rcu_local(void *arg) -{ - struct rcu_head_remote *rhrp = - container_of(arg, struct rcu_head_remote, rhp); - - rhrp->crf(rhrp->rhp, rhrp->func); -} - -/* - * Set up an rcu_head_remote structure and the invoke call_rcu_local() - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via - * smp_call_function_single(). - */ -static void invoke_crf_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp), - call_rcu_func_t crf) -{ - struct rcu_head_remote rhr; - - rhr.rhp = rhp; - rhr.crf = crf; - rhr.func = func; - smp_call_function_single(0, call_rcu_local, &rhr, 1); -} - -/* - * Helper functions to be passed to wait_rcu_gp(), each of which - * invokes invoke_crf_remote() to register a callback appropriately. - */ -static void __maybe_unused -call_rcu_preempt_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu); -} -static void call_rcu_bh_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_bh); -} -static void call_rcu_sched_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_sched); + trace_rcu_future_gp(rnp, rdp, c, "StartWait"); + for (;;) { + wait_event_interruptible( + rnp->nocb_gp_wq[c & 0x1], + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + if (likely(d)) + break; + flush_signals(current); + trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); + } + trace_rcu_future_gp(rnp, rdp, c, "EndWait"); + smp_mb(); /* Ensure that CB invocation happens after GP end. */ } /* @@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg) cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); ACCESS_ONCE(rdp->nocb_p_count) += c; ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - wait_rcu_gp(rdp->rsp->call_remote); + rcu_nocb_wait_gp(rdp); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) return; for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); - t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + t = kthread_run(rcu_nocb_kthread, rdp, + "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); ACCESS_ONCE(rdp->nocb_kthread) = t; } } /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool init_nocb_callback_list(struct rcu_data *rdp) { if (rcu_nocb_mask == NULL || !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) - return; + return false; rdp->nxttail[RCU_NEXT_TAIL] = NULL; + return true; } -/* Initialize the ->call_remote fields in the rcu_state structures. */ -static void __init rcu_init_nocb(void) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static int rcu_nocb_needs_gp(struct rcu_state *rsp) { -#ifdef CONFIG_PREEMPT_RCU - rcu_preempt_state.call_remote = call_rcu_preempt_remote; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - rcu_bh_state.call_remote = call_rcu_bh_remote; - rcu_sched_state.call_remote = call_rcu_sched_remote; + return 0; } -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ +} -static bool is_nocb_cpu(int cpu) +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) { - return false; } static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, @@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, return 0; } -static bool nocb_cpu_expendable(int cpu) -{ - return 1; -} - static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } @@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool init_nocb_callback_list(struct rcu_data *rdp) { + return false; } -static void __init rcu_init_nocb(void) +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * An adaptive-ticks CPU can potentially execute in kernel mode for an + * arbitrarily long period of time with the scheduling-clock tick turned + * off. RCU will be paying attention to this CPU because it is in the + * kernel, but the CPU cannot be guaranteed to be executing the RCU state + * machine because the scheduling-clock tick has been disabled. Therefore, + * if an adaptive-ticks CPU is failing to respond to the current grace + * period and has not be idle from an RCU perspective, kick it. + */ +static void rcu_kick_nohz_cpu(int cpu) { +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(cpu)) + smp_send_reschedule(cpu); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa670..cf6c17412932 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,8 +46,6 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -#define ulong2long(a) (*(long *)(&(a))) - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = { .open = rcubarrier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = { .open = rcuexp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = { .open = rcuhier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) @@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = { .open = rcugp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) diff --git a/kernel/relay.c b/kernel/relay.c index e8cd2027abbd..b91488ba2e5a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf) static void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - buf->chan->cb->remove_buf_file(buf->dentry); relay_destroy_buf(buf); } @@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; del_timer_sync(&buf->timer); + buf->chan->cb->remove_buf_file(buf->dentry); kref_put(&buf->kref, relay_remove_buf); } @@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename, chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; - chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; if (base_filename) { @@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, static int subbuf_read_actor(size_t read_start, struct rchan_buf *buf, size_t avail, - read_descriptor_t *desc, - read_actor_t actor) + read_descriptor_t *desc) { void *from; int ret = 0; @@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start, typedef int (*subbuf_actor_t) (size_t read_start, struct rchan_buf *buf, size_t avail, - read_descriptor_t *desc, - read_actor_t actor); + read_descriptor_t *desc); /* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, subbuf_actor_t subbuf_actor, - read_actor_t actor, read_descriptor_t *desc) { struct rchan_buf *buf = filp->private_data; @@ -1139,7 +1136,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_lock(&file_inode(filp)->i_mutex); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, break; avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc, actor); + ret = subbuf_actor(read_start, buf, avail, desc); if (desc->error < 0) break; @@ -1159,7 +1156,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + mutex_unlock(&file_inode(filp)->i_mutex); return desc->written; } @@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp, desc.count = count; desc.arg.buf = buffer; desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, - NULL, &desc); + return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b9..d7386986e10e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> +#include <linux/mm.h> #include <asm/io.h> @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. + */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +783,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } @@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); break; } next_res->name = name; @@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - kfree(res); + free_resource(res); res = NULL; break; } @@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. + */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + free_resource(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + free_resource(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 16502d3a71c8..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -17,6 +17,7 @@ * See rt.c in preempt-rt for proper credits and further information */ #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/spinlock.h> diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 98ec49475460..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -10,9 +10,11 @@ #include <linux/kthread.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/stat.h> #include "rtmutex.h" @@ -365,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at return curr - buf; } -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); static struct bus_type rttest_subsys = { .name = "rttest", diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a242e691c993..1e09308bf2a1 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/export.h> #include <linux/sched.h> +#include <linux/sched/rt.h> #include <linux/timer.h> #include "rtmutex_common.h" diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3fcd847..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..64de5f8b0c9e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref) ag->tg->rt_se = NULL; ag->tg->rt_rq = NULL; #endif + sched_offline_group(ag->tg); sched_destroy_group(ag->tg); } @@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; + sched_online_group(tg, &root_task_group); + kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492df..c3ae1446461c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 26058d0bebba..58453b8272fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -83,7 +83,7 @@ #endif #include "sched.h" -#include "../workqueue_sched.h" +#include "../workqueue_internal.h" #include "../smpboot.h" #define CREATE_TRACE_POINTS @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. */ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; @@ -549,7 +544,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -587,7 +582,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_full_nohz_cpu(int cpu) +{ + if (tick_nohz_full_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_full_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ void sched_avg_update(struct rq *rq) { @@ -1132,18 +1163,28 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; enum { cpuset, possible, fail } state = cpuset; int dest_cpu; - /* Look for allowed, online CPU in same node. */ - for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; + /* + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. + */ + if (nid != -1) { + nodemask = cpumask_of_node(nid); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } } for (;;) { @@ -1278,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); + trace_sched_wakeup(p, true); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -1352,7 +1393,8 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() + && !tick_nohz_full_cpu(smp_processor_id())) return; /* @@ -1369,6 +1411,7 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -1488,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { @@ -1742,9 +1787,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister); static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_in(notifier, raw_smp_processor_id()); } @@ -1753,9 +1797,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) { struct preempt_notifier *notifier; - struct hlist_node *node; - hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) notifier->ops->sched_out(notifier, next); } @@ -1850,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP @@ -1969,11 +2014,10 @@ context_switch(struct rq *rq, struct task_struct *prev, } /* - * nr_running, nr_uninterruptible and nr_context_switches: + * nr_running and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, current number of uninterruptible-sleeping threads, total - * number of context switches performed since bootup. + * threads, total number of context switches performed since bootup. */ unsigned long nr_running(void) { @@ -1985,23 +2029,6 @@ unsigned long nr_running(void) return sum; } -unsigned long nr_uninterruptible(void) -{ - unsigned long i, sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_uninterruptible; - - /* - * Since we read the counters lockless, it might be slightly - * inaccurate. Do not allow it to go below zero though: - */ - if (unlikely((long)sum < 0)) - sum = 0; - - return sum; -} - unsigned long long nr_context_switches(void) { int i; @@ -2131,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. * @@ -2357,12 +2384,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2522,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2582,7 +2609,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -2709,7 +2736,34 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + rq_last_tick_reset(rq); +} + +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_usecs(next - now) * NSEC_PER_USEC; } +#endif notrace unsigned long get_parent_ip(unsigned long addr) { @@ -2786,7 +2840,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (irqs_disabled()) print_irqtrace_events(prev); dump_stack(); - add_taint(TAINT_WARN); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } /* @@ -3007,51 +3061,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -3092,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); + enum ctx_state prev_state; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - user_exit(); + prev_state = exception_enter(); + do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -3110,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void) */ barrier(); } while (need_resched()); + + exception_exit(prev_state); } #endif /* CONFIG_PREEMPT */ @@ -3268,7 +3281,8 @@ void complete_all(struct completion *x) EXPORT_SYMBOL(complete_all); static inline long __sched -do_wait_for_common(struct completion *x, long timeout, int state) +do_wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { if (!x->done) { DECLARE_WAITQUEUE(wait, current); @@ -3281,7 +3295,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) } __set_current_state(state); spin_unlock_irq(&x->wait.lock); - timeout = schedule_timeout(timeout); + timeout = action(timeout); spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); @@ -3292,17 +3306,30 @@ do_wait_for_common(struct completion *x, long timeout, int state) return timeout ?: 1; } -static long __sched -wait_for_common(struct completion *x, long timeout, int state) +static inline long __sched +__wait_for_common(struct completion *x, + long (*action)(long), long timeout, int state) { might_sleep(); spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, timeout, state); + timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); return timeout; } +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ + return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + /** * wait_for_completion: - waits for completion of a task * @x: holds the state of this particular completion @@ -3339,6 +3366,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) EXPORT_SYMBOL(wait_for_completion_timeout); /** + * wait_for_completion_io: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ + wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ + return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** * wait_for_completion_interruptible: - waits for completion of a task (w/intr) * @x: holds the state of this particular completion * @@ -4089,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4364,20 +4428,32 @@ EXPORT_SYMBOL(yield); * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. * - * Returns true if we indeed boosted the target task. + * Returns: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. */ bool __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; unsigned long flags; - bool yielded = 0; + int yielded = 0; local_irq_save(flags); rq = this_rq(); again: p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } + double_rq_lock(rq, p_rq); while (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); @@ -4385,13 +4461,13 @@ again: } if (!curr->sched_class->yield_to_task) - goto out; + goto out_unlock; if (curr->sched_class != p->sched_class) - goto out; + goto out_unlock; if (task_running(p_rq, p) || p->state) - goto out; + goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { @@ -4404,11 +4480,12 @@ again: resched_task(p_rq->curr); } -out: +out_unlock: double_rq_unlock(rq, p_rq); +out_irq: local_irq_restore(flags); - if (yielded) + if (yielded > 0) schedule(); return yielded; @@ -4576,6 +4653,7 @@ void sched_show_task(struct task_struct *p) task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); + print_worker_info(KERN_INFO, p); show_stack(p, NULL); } @@ -4667,6 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif @@ -4722,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -4948,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, @@ -6197,7 +6271,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ @@ -6810,11 +6884,15 @@ int in_sched_functions(unsigned long addr) } #ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. + */ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6851,7 +6929,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -6877,12 +6955,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -6946,9 +7018,12 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7160,7 +7235,6 @@ static void free_sched_group(struct task_group *tg) struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - unsigned long flags; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -7172,6 +7246,17 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); @@ -7181,12 +7266,6 @@ struct task_group *sched_create_group(struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); - - return tg; - -err: - free_sched_group(tg); - return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ @@ -7199,6 +7278,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ unsigned long flags; int i; @@ -7210,9 +7295,6 @@ void sched_destroy_group(struct task_group *tg) list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7397,7 +7479,7 @@ unlock: return err; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7409,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; @@ -7421,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) { u64 rt_runtime, rt_period; @@ -7434,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg) { u64 rt_period_us; @@ -7469,7 +7551,7 @@ static int sched_rt_global_constraints(void) return ret; } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept realtime tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) @@ -7508,6 +7590,25 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -7564,6 +7665,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } +static int cpu_cgroup_css_online(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct task_group *parent; + + if (!cgrp->parent) + return 0; + + parent = cgroup_tg(cgrp->parent); + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7571,6 +7685,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp) sched_destroy_group(tg); } +static void cpu_cgroup_css_offline(struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_offline_group(tg); +} + static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -7926,6 +8047,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, + .css_online = cpu_cgroup_css_online, + .css_offline = cpu_cgroup_css_offline, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, @@ -7936,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..dbb7e2cd95eb --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,296 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + while (true) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; + } + + rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + rcu_read_lock(); + ca = task_ca(p); + while (ca != &root_cpuacct) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = __parent_ca(ca); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..ed605624a5e7 --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 23aa789c53ee..1095e878a46f 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -28,6 +28,8 @@ */ #include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/sched/rt.h> #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 293b202fcf79..cc2dc3eea8a3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,7 @@ #include <linux/tsacct_kern.h> #include <linux/kernel_stat.h> #include <linux/static_key.h> +#include <linux/context_tracking.h> #include "sched.h" @@ -114,10 +115,6 @@ static int irqtime_account_si_update(void) static inline void task_group_account_field(struct task_struct *p, int index, u64 tmp) { -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif /* * Since all updates are sure to touch the root cgroup, we * get ourselves ahead and touch it first. If the root cgroup @@ -126,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif + cpuacct_account_field(p, index, tmp); } /* @@ -163,7 +148,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for user time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -213,7 +198,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, task_group_account_field(p, index, (__force u64) cputime); /* Account for system time used */ - acct_update_integrals(p); + acct_account_cputime(p); } /* @@ -295,6 +280,7 @@ static __always_inline bool steal_account_process_tick(void) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; + cputime_t utime, stime; struct task_struct *t; times->utime = sig->utime; @@ -308,16 +294,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime += t->utime; - times->stime += t->stime; + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: rcu_read_unlock(); } -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -382,12 +367,90 @@ static void irqtime_account_idle_ticks(int ticks) irqtime_account_process_tick(current, 0, rq); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static void irqtime_account_idle_ticks(int ticks) {} -static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +static inline void irqtime_account_idle_ticks(int ticks) {} +static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (!vtime_accounting_enabled()) + return; + + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + if (!in_interrupt()) { + /* + * If we interrupted user, context_tracking_in_user() + * is 1 because the context tracking don't hook + * on irq entry/exit. This way we know if + * we need to flush user time on kernel entry. + */ + if (context_tracking_in_user()) { + vtime_account_user(tsk); + return; + } + + if (is_idle_task(tsk)) { + vtime_account_idle(tsk); + return; + } + } + vtime_account_system(tsk); +} +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to * @user_tick: indicates if the tick is a user or a system tick @@ -397,6 +460,9 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (vtime_accounting_enabled()) + return; + if (sched_clock_irqtime) { irqtime_account_process_tick(p, user_tick, rq); return; @@ -439,88 +505,49 @@ void account_idle_ticks(unsigned long ticks) account_idle_time(jiffies_to_cputime(ticks)); } -#endif - /* - * Use precise platform statistics if available: + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * loosing precision when the numbers are big. */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} - -void vtime_account_system_irqsafe(struct task_struct *tsk) +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) { - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); -} -#endif - -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). - */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account(struct task_struct *tsk) -{ - if (in_interrupt() || !is_idle_task(tsk)) - vtime_account_system(tsk); - else - vtime_account_idle(tsk); -} -EXPORT_SYMBOL_GPL(vtime_account); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - -#else - -#ifndef nsecs_to_cputime -# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -#endif - -static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) -{ - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); - else - temp = div64_u64(temp, (__force u64) total); + u64 scaled; + + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) { + u64 tmp = rtime; rtime = stime; stime = tmp; + } + + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; + + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; + + /* Can we just balance rtime/stime rather than dropping bits? */ + if (stime >> 31) + goto drop_precision; + + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; + +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; + } - return (__force cputime_t) temp; + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); + return (__force cputime_t) scaled; } /* @@ -531,10 +558,16 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime, total; + cputime_t rtime, stime, utime, total; + + if (vtime_accounting_enabled()) { + *ut = curr->utime; + *st = curr->stime; + return; + } - utime = curr->utime; - total = utime + curr->stime; + stime = curr->stime; + total = stime + curr->utime; /* * Tick based cputime accounting depend on random scheduling @@ -548,19 +581,32 @@ static void cputime_adjust(struct task_cputime *curr, */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); - if (total) - utime = scale_utime(utime, rtime, total); - else - utime = rtime; + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + if (total) { + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } else { + stime = rtime; + utime = 0; + } /* * If the tick based count grows faster than the scheduler one, * the result of the scaling may go backward. * Let's enforce monotonicity. */ + prev->stime = max(prev->stime, stime); prev->utime = max(prev->utime, utime); - prev->stime = max(prev->stime, rtime - prev->utime); +out: *ut = prev->utime; *st = prev->stime; } @@ -568,11 +614,10 @@ static void cputime_adjust(struct task_cputime *curr, void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime = { - .utime = p->utime, - .stime = p->stime, .sum_exec_runtime = p->se.sum_exec_runtime, }; + task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } @@ -586,4 +631,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static unsigned long long vtime_delta(struct task_struct *tsk) +{ + unsigned long long clock; + + clock = local_clock(); + if (clock < tsk->vtime_snap) + return 0; + + return clock - tsk->vtime_snap; +} + +static cputime_t get_vtime_delta(struct task_struct *tsk) +{ + unsigned long long delta = vtime_delta(tsk); + + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); + tsk->vtime_snap += delta; + + /* CHECKME: always safe to convert nsecs to cputime? */ + return nsecs_to_cputime(delta); +} + +static void __vtime_account_system(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); +} + +void vtime_account_system(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_irq_exit(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + if (context_tracking_in_user()) + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_user(struct task_struct *tsk) +{ + cputime_t delta_cpu; + + if (!vtime_accounting_enabled()) + return; + + delta_cpu = get_vtime_delta(tsk); + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_SYS; + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_user_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + write_seqlock(&tsk->vtime_seqlock); + tsk->vtime_snap_whence = VTIME_USER; + __vtime_account_system(tsk); + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_enter(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags |= PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_guest_exit(struct task_struct *tsk) +{ + write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; + write_sequnlock(&tsk->vtime_seqlock); +} + +void vtime_account_idle(struct task_struct *tsk) +{ + cputime_t delta_cpu = get_vtime_delta(tsk); + + account_idle_time(delta_cpu); +} + +bool vtime_accounting_enabled(void) +{ + return context_tracking_active(); +} + +void arch_vtime_task_switch(struct task_struct *prev) +{ + write_seqlock(&prev->vtime_seqlock); + prev->vtime_snap_whence = VTIME_SLEEPING; + write_sequnlock(&prev->vtime_seqlock); + + write_seqlock(¤t->vtime_seqlock); + current->vtime_snap_whence = VTIME_SYS; + current->vtime_snap = sched_clock(); + write_sequnlock(¤t->vtime_seqlock); +} + +void vtime_init_idle(struct task_struct *t) +{ + unsigned long flags; + + write_seqlock_irqsave(&t->vtime_seqlock, flags); + t->vtime_snap_whence = VTIME_SYS; + t->vtime_snap = sched_clock(); + write_sequnlock_irqrestore(&t->vtime_seqlock, flags); +} + +cputime_t task_gtime(struct task_struct *t) +{ + unsigned int seq; + cputime_t gtime; + + do { + seq = read_seqbegin(&t->vtime_seqlock); + + gtime = t->gtime; + if (t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqretry(&t->vtime_seqlock, seq)); + + return gtime; +} + +/* + * Fetch cputime raw values from fields of task_struct and + * add up the pending nohz execution time since the last + * cputime snapshot. + */ +static void +fetch_task_cputime(struct task_struct *t, + cputime_t *u_dst, cputime_t *s_dst, + cputime_t *u_src, cputime_t *s_src, + cputime_t *udelta, cputime_t *sdelta) +{ + unsigned int seq; + unsigned long long delta; + + do { + *udelta = 0; + *sdelta = 0; + + seq = read_seqbegin(&t->vtime_seqlock); + + if (u_dst) + *u_dst = *u_src; + if (s_dst) + *s_dst = *s_src; + + /* Task is sleeping, nothing to add */ + if (t->vtime_snap_whence == VTIME_SLEEPING || + is_idle_task(t)) + continue; + + delta = vtime_delta(t); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { + *udelta = delta; + } else { + if (t->vtime_snap_whence == VTIME_SYS) + *sdelta = delta; + } + } while (read_seqretry(&t->vtime_seqlock, seq)); +} + + +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utime, stime, &t->utime, + &t->stime, &udelta, &sdelta); + if (utime) + *utime += udelta; + if (stime) + *stime += sdelta; +} + +void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, cputime_t *stimescaled) +{ + cputime_t udelta, sdelta; + + fetch_task_cputime(t, utimescaled, stimescaled, + &t->utimescaled, &t->stimescaled, &udelta, &sdelta); + if (utimescaled) + *utimescaled += cputime_to_scaled(udelta); + if (stimescaled) + *stimescaled += cputime_to_scaled(sdelta); +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7ae4c4c5420e..75024a673520 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - /* - * May be NULL if the underlying cgroup isn't fully-created yet - */ - if (!tg->css.cgroup) { - group_path[0] = '\0'; - return group_path; - } cgroup_path(tg->css.cgroup, group_path, PATH_MAX); return group_path; } @@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu) { unsigned int freq = cpu_khz ? : 1; - SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } #else - SEQ_printf(m, "\ncpu#%d\n", cpu); + SEQ_printf(m, "cpu#%d\n", cpu); #endif #define P(x) \ @@ -330,6 +323,7 @@ do { \ print_rq(m, rq, cpu); rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); + SEQ_printf(m, "\n"); } static const char *sched_tunable_scaling_names[] = { @@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = { "linear" }; -static int sched_debug_show(struct seq_file *m, void *v) +static void sched_debug_header(struct seq_file *m) { u64 ktime, sched_clk, cpu_clk; unsigned long flags; - int cpu; local_irq_save(flags); ktime = ktime_to_ns(ktime_get()); @@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P - SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + SEQ_printf(m, " .%-40s: %d (%s)\n", + "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + SEQ_printf(m, "\n"); +} - for_each_online_cpu(cpu) - print_cpu(m, cpu); +static int sched_debug_show(struct seq_file *m, void *v) +{ + int cpu = (unsigned long)(v - 2); - SEQ_printf(m, "\n"); + if (cpu != -1) + print_cpu(m, cpu); + else + sched_debug_header(m); return 0; } void sysrq_sched_debug_show(void) { - sched_debug_show(NULL, NULL); + int cpu; + + sched_debug_header(NULL); + for_each_online_cpu(cpu) + print_cpu(NULL, cpu); + +} + +/* + * This itererator needs some explanation. + * It returns 1 for the header position. + * This means 2 is cpu 0. + * In a hotplugged system some cpus, including cpu 0, may be missing so we have + * to use cpumask_* to iterate over the cpus. + */ +static void *sched_debug_start(struct seq_file *file, loff_t *offset) +{ + unsigned long n = *offset; + + if (n == 0) + return (void *) 1; + + n--; + + if (n > 0) + n = cpumask_next(n - 1, cpu_online_mask); + else + n = cpumask_first(cpu_online_mask); + + *offset = n + 1; + + if (n < nr_cpu_ids) + return (void *)(unsigned long)(n + 2); + return NULL; +} + +static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset) +{ + (*offset)++; + return sched_debug_start(file, offset); +} + +static void sched_debug_stop(struct seq_file *file, void *data) +{ +} + +static const struct seq_operations sched_debug_sops = { + .start = sched_debug_start, + .next = sched_debug_next, + .stop = sched_debug_stop, + .show = sched_debug_show, +}; + +static int sched_debug_release(struct inode *inode, struct file *file) +{ + seq_release(inode, file); + + return 0; } static int sched_debug_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_debug_show, NULL); + int ret = 0; + + ret = seq_open(filp, &sched_debug_sops); + + return ret; } static const struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = sched_debug_release, }; static int __init init_sched_debug_procfs(void) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 81fa53643409..c61a614465c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); + s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) - min_vruntime = vruntime; + max_vruntime = vruntime; - return min_vruntime; + return max_vruntime; } static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) @@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) vruntime = min_vruntime(vruntime, se->vruntime); } + /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT smp_wmb(); @@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * We calculate the vruntime slice of a to be inserted task + * We calculate the vruntime slice of a to-be-inserted task. * * vs = s/w */ @@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + #else static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} @@ -1680,9 +1702,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); - - se->vruntime = vruntime; + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -3254,25 +3274,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_idle_sibling(struct task_struct *p, int target) { - int cpu = smp_processor_id(); - int prev_cpu = task_cpu(p); struct sched_domain *sd; struct sched_group *sg; - int i; + int i = task_cpu(p); - /* - * If the task is going to be woken-up on this cpu and if it is - * already idle, then it is the right target. - */ - if (target == cpu && idle_cpu(cpu)) - return cpu; + if (idle_cpu(target)) + return target; /* - * If the task is going to be woken-up on the cpu where it previously - * ran and if it is currently idle, then it the right target. + * If the prevous cpu is cache affine and idle, don't be stupid. */ - if (target == prev_cpu && idle_cpu(prev_cpu)) - return prev_cpu; + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; /* * Otherwise, iterate the domains and find an elegible idle cpu. @@ -3286,7 +3299,7 @@ static int select_idle_sibling(struct task_struct *p, int target) goto next; for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) + if (i == target || !idle_cpu(i)) goto next; } @@ -3883,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { - int new_dst_cpu; + int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3903,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; - new_dst_cpu = cpumask_first_and(env->dst_grpmask, - tsk_cpus_allowed(p)); - if (new_dst_cpu < nr_cpu_ids) { - env->flags |= LBF_SOME_PINNED; - env->new_dst_cpu = new_dst_cpu; + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } } + return 0; } @@ -3929,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif + return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* @@ -3957,9 +3974,6 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; - if (!can_migrate_task(p, env)) continue; @@ -4011,7 +4025,7 @@ static int move_tasks(struct lb_env *env) break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (!can_migrate_task(p, env)) goto next; load = task_h_load(p); @@ -4022,9 +4036,6 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - if (!can_migrate_task(p, env)) - goto next; - move_task(p, env); pulled++; env->imbalance -= load; @@ -4254,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } @@ -4264,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) return default_scale_freq_power(sd, cpu); } -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -4279,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return default_scale_smt_power(sd, cpu); } -unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -4969,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -5000,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *balance) { int ld_moved, cur_ld_moved, active_balance = 0; - int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, @@ -5016,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, }; + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) + env.dst_grpmask = NULL; + cpumask_copy(cpus, cpu_active_mask); - max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -5043,7 +5059,6 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; - lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -5070,17 +5085,17 @@ more_balance: double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. */ if (cur_ld_moved && env.dst_cpu != smp_processor_id()) resched_cpu(env.dst_cpu); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group @@ -5100,14 +5115,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && - lb_iterations++ < max_lb_iterations) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5228,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; - update_rq_runnable_avg(this_rq, 1); - /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ @@ -5339,7 +5355,7 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5404,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5419,13 +5438,16 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5477,7 +5499,7 @@ void update_max_interval(void) * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { @@ -5515,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; } @@ -5549,9 +5572,9 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5694,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif @@ -6101,7 +6124,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); return rr_interval; } @@ -6164,7 +6187,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - -/* * Decrement CPU power based on time not spent running tasks */ SCHED_FEAT(NONTASK_POWER, true) diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..d8da01008d39 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); + rq_last_tick_reset(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif return rq->idle; } @@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4f02b2847357..127a2c4cf4ab 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,8 @@ #include <linux/slab.h> +int sched_rr_timeslice = RR_TIMESLICE; + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); struct rt_bandwidth def_rt_bandwidth; @@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq) return; delta_exec = rq->clock_task - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; return 0; } @@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->on_rq && !rq->rt.rt_nr_running) - pull_rt_task(rq); + if (!p->on_rq || rq->rt.rt_nr_running) + return; + + if (pull_rt_task(rq)) + resched_task(rq->curr); } void init_sched_rt_class(void) @@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (soft != RLIM_INFINITY) { unsigned long next; - p->rt.timeout++; + if (p->rt.watchdog_stamp != jiffies) { + p->rt.timeout++; + p->rt.watchdog_stamp = jiffies; + } + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) p->cputime_expires.sched_exp = p->se.sum_exec_runtime; @@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = RR_TIMESLICE; + p->rt.time_slice = sched_rr_timeslice; /* * Requeue to the end of queue if we (and all of our ancestors) are the @@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return RR_TIMESLICE; + return sched_rr_timeslice; else return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc886441436a..ce39224d6155 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,10 +1,14 @@ #include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/tick.h> #include "cpupri.h" +#include "cpuacct.h" extern __read_mostly int scheduler_running; @@ -31,6 +35,31 @@ extern __read_mostly int scheduler_running; */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT @@ -152,11 +181,6 @@ struct task_group { #define MAX_SHARES (1UL << 18) #endif -/* Default task group. - * Every task in system belong to this group at bootup. - */ -extern struct task_group root_task_group; - typedef int (*tg_visitor)(struct task_group *, void *); extern int walk_tg_tree_from(struct task_group *from, @@ -194,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif + #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; @@ -370,10 +406,13 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif int skip_clock_update; /* capture load from *all* tasks on this cpu: */ @@ -545,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +struct sched_group_power { + atomic_t ref; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. + */ + unsigned int power, power_orig; + unsigned long next_update; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_power *sgp; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + extern int group_balance_cpu(struct sched_group *sg); #endif /* CONFIG_SMP */ @@ -782,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -854,14 +955,61 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif - CPUACCT_STAT_NSTATS, -}; +#define DEQUEUE_SLEEP 1 +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq |