summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks9
-rw-r--r--kernel/cgroup.c461
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cpuset.c500
-rw-r--r--kernel/events/core.c34
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kprobes.c14
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/locking/mcs_spinlock.c64
-rw-r--r--kernel/locking/mcs_spinlock.h9
-rw-r--r--kernel/locking/mutex.c2
-rw-r--r--kernel/locking/rwsem-spinlock.c28
-rw-r--r--kernel/locking/rwsem-xadd.c16
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/power/hibernate.c6
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/rcu/rcutorture.c4
-rw-r--r--kernel/rcu/tree.c140
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c22
-rw-r--r--kernel/sched/core.c9
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/time/alarmtimer.c20
-rw-r--r--kernel/time/clockevents.c10
-rw-r--r--kernel/time/sched_clock.c4
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/ftrace.c445
-rw-r--r--kernel/trace/ring_buffer.c26
-rw-r--r--kernel/trace/trace.c98
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_clock.c9
-rw-r--r--kernel/trace/trace_events.c56
-rw-r--r--kernel/trace/trace_functions_graph.c43
-rw-r--r--kernel/trace/trace_output.c282
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_seq.c428
-rw-r--r--kernel/workqueue.c206
42 files changed, 1918 insertions, 1072 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
endif
+config ARCH_SUPPORTS_ATOMIC_RMW
+ bool
+
config MUTEX_SPIN_ON_OWNER
def_bool y
- depends on SMP && !DEBUG_MUTEXES
+ depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
+config RWSEM_SPIN_ON_OWNER
+ def_bool y
+ depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
config ARCH_USE_QUEUE_RWLOCK
bool
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
*/
static bool cgrp_dfl_root_visible;
+/*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files to use ->legacy_files on the default hierarchy.
+ */
+static bool cgroup_legacy_files_on_dfl;
+
/* some controllers are not supported in the default hierarchy */
-static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
-#ifdef CONFIG_CGROUP_DEBUG
- | (1 << debug_cgrp_id)
-#endif
- ;
+static unsigned int cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
*/
static int need_forkexit_callback __read_mostly;
-static struct cftype cgroup_base_files[];
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
}
/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function determines which subsystems need to be enabled given the
+ * current @cgrp->subtree_control and records it in
+ * @cgrp->child_subsys_mask. The resulting mask is always a superset of
+ * @cgrp->subtree_control and follows the usual hierarchy rules.
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = cgrp->subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->child_subsys_mask = cur_ss_mask;
+ return;
+ }
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ cgrp->child_subsys_mask = cur_ss_mask;
+}
+
+/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
up_write(&css_set_rwsem);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root)
- dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ }
if (ss->bind)
ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
- seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
bool all_ss = false, one_ss = false;
unsigned int mask = -1U;
struct cgroup_subsys *ss;
+ int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- /* Consistency checks */
-
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-
- if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
- opts->cpuset_clone_children || opts->release_agent ||
- opts->name) {
- pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
- } else {
- /*
- * If the 'all' option was specified select all the
- * subsystems, otherwise if 'none', 'name=' and a subsystem
- * name options were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (!ss->disabled)
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So
- * all empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ return 0;
}
/*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
-
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct cgroup_sb_opts opts;
unsigned int added_mask, removed_mask;
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: remount is not allowed\n");
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
return -EINVAL;
}
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
- if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
- root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+ opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
struct css_set *cset;
int i, ret;
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
if (ret)
goto destroy_root;
@@ -1638,7 +1698,7 @@ destroy_root:
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
- percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+ percpu_ref_exit(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
/* look for a matching existing root */
- if (!opts.subsys_mask && !opts.none && !opts.name) {
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
- if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
- if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto out_unlock;
- } else {
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- }
- }
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
* We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ seq_puts(seq, "0\n");
return 0;
}
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
return 0;
}
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
- if (cgrp->child_subsys_mask & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ /*
+ * @ss is already enabled through dependency and
+ * we'll just make it visible. Skip draining.
+ */
+ if (cgrp->child_subsys_mask & (1 << ssid))
+ continue;
+
/*
* Because css offlining is asynchronous, userland
* might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
return restart_syscall();
}
-
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
- ret = -ENOENT;
- goto out_unlock;
- }
} else if (disable & (1 << ssid)) {
- if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
- if (child->child_subsys_mask & (1 << ssid)) {
+ if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Except for the root, child_subsys_mask must be zero for a cgroup
+ * Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Create csses for enables and update child_subsys_mask. This
- * changes cgroup_e_css() results which in turn makes the
- * subsequent cgroup_update_dfl_csses() associate all tasks in the
- * subtree to the updated csses.
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
+
+ old_ctrl = cgrp->child_subsys_mask;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ new_ctrl = cgrp->child_subsys_mask;
+
+ css_enable = ~old_ctrl & new_ctrl;
+ css_disable = old_ctrl & ~new_ctrl;
+ enable |= css_enable;
+ disable |= css_disable;
+
+ /*
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
*/
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
continue;
cgroup_for_each_live_child(child, cgrp) {
- ret = create_css(child, ss);
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
if (ret)
goto err_undo_css;
}
}
- cgrp->child_subsys_mask |= enable;
- cgrp->child_subsys_mask &= ~disable;
-
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
goto err_undo_css;
- /* all tasks are now migrated away from the old csses, kill them */
+ /*
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+ * css must not actively control resources and be in the vanilla
+ * state if it's made visible again later. Controllers which may
+ * be depended upon should provide ->css_reset() for this purpose.
+ */
for_each_subsys(ss, ssid) {
if (!(disable & (1 << ssid)))
continue;
- cgroup_for_each_live_child(child, cgrp)
- kill_css(cgroup_css(child, ss));
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
}
kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->child_subsys_mask &= ~enable;
- cgrp->child_subsys_mask |= disable;
+ cgrp->subtree_control &= ~enable;
+ cgrp->subtree_control |= disable;
+ cgroup_refresh_child_subsys_mask(cgrp);
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
cgroup_for_each_live_child(child, cgrp) {
struct cgroup_subsys_state *css = cgroup_css(child, ss);
- if (css)
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
}
}
goto out_unlock;
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
/*
* This isn't a proper migration and its usefulness is very
- * limited. Disallow if sane_behavior.
+ * limited. Disallow on the default hierarchy.
*/
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return -EPERM;
/*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
}
/**
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
+ */
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
@@ -3699,8 +3841,9 @@ after:
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if
- * sane_behavior so that no such expectation exists in the new interface.
+ * want to do away with it. Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return pid_fry(pid);
else
return pid;
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
return 0;
}
-static struct cftype cgroup_base_files[] = {
+/* cgroup core interface files for the default hierarchy */
+static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {
.mode = S_IRUGO | S_IWUSR,
},
{
- .name = "cgroup.clone_children",
- .flags = CFTYPE_INSANE,
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_root_controllers_show,
},
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
- .flags = CFTYPE_ONLY_ON_DFL,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
{
.name = "cgroup.populated",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_populated_show,
},
+ { } /* terminate */
+};
- /*
- * Historical crazy stuff. These don't have "cgroup." prefix and
- * don't exist if sane_behavior. If you're depending on these, be
- * prepared to be burned.
- */
+/* cgroup core interface files for the legacy hierarchies */
+static struct cftype cgroup_legacy_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
{
.name = "tasks",
- .flags = CFTYPE_INSANE, /* use "procs" instead */
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "notify_on_release",
- .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
- .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
@@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
+ percpu_ref_exit(&css->refcnt);
+
if (css->ss) {
/* css free path */
if (css->parent)
@@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
+ * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created.
- * Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp with all interface files created if
+ * @visible. Returns 0 on success, -errno on failure.
*/
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
goto err_free_percpu_ref;
css->id = err;
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
- if (err)
- goto err_free_id;
+ if (visible) {
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+ }
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4372,7 +4527,7 @@ err_list_del:
err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
- percpu_ref_cancel_init(&css->refcnt);
+ percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
return err;
@@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
+ struct cftype *base_files;
int ssid, ret;
parent = cgroup_kn_lock_live(parent_kn);
@@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (cgroup_on_dfl(cgrp))
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(cgrp, base_files, true);
if (ret)
goto out_destroy;
/* let's create and online css's */
for_each_subsys(ss, ssid) {
if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss);
+ ret = create_css(cgrp, ss,
+ parent->subtree_control & (1 << ssid));
if (ret)
goto out_destroy;
}
@@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
/*
* On the default hierarchy, a child doesn't automatically inherit
- * child_subsys_mask from the parent. Each is configured manually.
+ * subtree_control from the parent. Each is configured manually.
*/
- if (!cgroup_on_dfl(cgrp))
- cgrp->child_subsys_mask = parent->child_subsys_mask;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->subtree_control = parent->subtree_control;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ }
kernfs_activate(kn);
@@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
out_free_id:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
- percpu_ref_cancel_init(&cgrp->self.refcnt);
+ percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
kfree(cgrp);
out_unlock:
@@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts =
- { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ static struct cgroup_sb_opts __initdata opts;
struct cgroup_subsys *ss;
int i;
@@ -4775,7 +4938,8 @@ int __init cgroup_init(void)
unsigned long key;
int ssid, err;
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
mutex_lock(&cgroup_mutex);
@@ -4807,9 +4971,22 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (!ss->disabled) {
- cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ if (ss->disabled)
+ continue;
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+ ss->dfl_cftypes = ss->legacy_cftypes;
+
+ if (!ss->dfl_cftypes)
+ cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
}
@@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+static int __init cgroup_set_legacy_files_on_dfl(char *str)
+{
+ printk("cgroup: using legacy files on the default hierarchy\n");
+ cgroup_legacy_files_on_dfl = true;
+ return 0;
+}
+__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = {
struct cgroup_subsys debug_cgrp_subsys = {
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
- .base_cftypes = debug_files,
+ .legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
.css_free = freezer_css_free,
.attach = freezer_attach,
.fork = freezer_fork,
- .base_cftypes = files,
+ .legacy_cftypes = files,
};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
- cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
- nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierachy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
/*
* This is old Memory Nodes tasks took on.
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/*
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
- kfree(trial);
- return NULL;
- }
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+free_cs:
+ kfree(trial);
+ return NULL;
}
/**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static void free_trial_cpuset(struct cpuset *trial)
{
+ free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
- /* We must be a subset of our parent cpuset */
+ /* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!is_cpuset_subset(trial, par))
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -480,11 +514,11 @@ out:
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
- return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+ cpumask_copy(doms[0], top_cpuset.effective_cpus);
goto done;
}
@@ -705,7 +739,7 @@ restart:
struct cpuset *b = csa[j];
if (apn == b->pn) {
- cpumask_or(dp, dp, b->cpus_allowed);
+ cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
}
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- * if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
- while (cpumask_empty(cs->cpus_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- * if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
- while (nodes_empty(cs->mems_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
}
/*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
*
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
}
/**
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
- int is_load_balanced;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
return -EINVAL;
}
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- update_tasks_cpumask_hier(cs, true);
-
- if (is_load_balanced)
- rebuild_sched_domains_locked();
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
return 0;
}
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
- struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
}
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- guarantee_online_mems(mems_cs, &newmems);
+ guarantee_online_mems(cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
}
/*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
*
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hiearchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!nodes_empty(cp->mems_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ mutex_lock(&callback_mutex);
+ cp->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
update_tasks_nodemask(cp);
rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_MEMORY])) {
- retval = -EINVAL;
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
goto done;
}
}
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask_hier(cs, true);
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &cs->mems_allowed);
done:
return retval;
}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
mutex_lock(&cpuset_mutex);
- /*
- * We allow to move tasks into an empty cpuset if sane_behavior
- * flag is set.
- */
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_sane_behavior(css->cgroup) &&
+ if (!cgroup_on_dfl(css->cgroup) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
struct task_struct *leader = cgroup_taskset_first(tset);
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
mutex_lock(&cpuset_mutex);
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cpus_cs, cpus_attach);
+ guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, tset) {
/*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->effective_mems;
mm = get_task_mm(leader);
if (mm) {
- struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
/*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* mm from.
*/
if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+ case FILE_EFFECTIVE_CPULIST:
+ s += cpulist_scnprintf(s, count, cs->effective_cpus);
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ s += nodelist_scnprintf(s, count, cs->effective_mems);
+ break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1779,6 +1812,18 @@ static struct cftype files[] = {
},
{
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
+ mutex_lock(&callback_mutex);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ mutex_unlock(&callback_mutex);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ mutex_lock(&callback_mutex);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .base_cftypes = files,
- .early_init = 1,
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init = 1,
};
/**
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
}
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migratecd to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- static cpumask_t off_cpus;
- static nodemask_t off_mems;
- bool is_empty;
- bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2064,51 +2207,20 @@ retry:
goto retry;
}
- cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
- nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' cpumask
- * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
- * call update_tasks_cpumask() if the cpuset becomes empty, as
- * the tasks in it will be migrated to an ancestor.
- */
- if ((sane && cpumask_empty(cs->cpus_allowed)) ||
- (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs);
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' nodemask
- * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
- * call update_tasks_nodemask() if the cpuset becomes empty, as
- * the tasks in it will be migratd to an ancestor.
- */
- if ((sane && nodes_empty(cs->mems_allowed)) ||
- (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs);
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
mutex_unlock(&cpuset_mutex);
-
- /*
- * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
- *
- * Otherwise move tasks to the nearest ancestor with execution
- * resources. This is full cgroup operation which will
- * also call back into cpuset. Should be done outside any lock.
- */
- if (!sane && is_empty)
- remove_tasks_in_empty_cpuset(cs);
}
/**
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
mutex_lock(&cpuset_mutex);
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
- cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = new_mems;
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
}
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- struct cpuset *cpus_cs;
-
mutex_lock(&callback_mutex);
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- guarantee_online_cpus(cpus_cs, pmask);
+ guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- struct cpuset *cpus_cs;
-
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
rcu_read_unlock();
/*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &mask);
+ guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a33d9a2bcbd7..6b17ac1b0c2a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
next_parent = rcu_dereference(next_ctx->parent_ctx);
/* If neither context have a parent context; they cannot be clones. */
- if (!parent && !next_parent)
+ if (!parent || !next_parent)
goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -7458,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- perf_remove_from_context(child_event, true);
+ /*
+ * Do not destroy the 'original' grouping; because of the context
+ * switch optimization the original events could've ended up in a
+ * random child task.
+ *
+ * If we were to destroy the original group, all group related
+ * operations would cease to function properly after this random
+ * child dies.
+ *
+ * Do destroy all inherited groups, we don't care about those
+ * and being thorough is better.
+ */
+ perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
@@ -7474,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event *child_event, *next;
- struct perf_event_context *child_ctx;
+ struct perf_event_context *child_ctx, *parent_ctx;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7499,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
+
+ /*
+ * In order to avoid freeing: child_ctx->parent_ctx->task
+ * under perf_event_context::lock, grab another reference.
+ */
+ parent_ctx = child_ctx->parent_ctx;
+ if (parent_ctx)
+ get_ctx(parent_ctx);
+
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
@@ -7509,6 +7530,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
+ * Now that we no longer hold perf_event_context::lock, drop
+ * our extra child_ctx->parent_ctx reference.
+ */
+ if (parent_ctx)
+ put_ctx(parent_ctx);
+
+ /*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 369f41a94124..4b8f0c925884 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
+#include <linux/hugetlb.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void)
#endif
VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
arch_crash_save_vmcoreinfo();
update_vmcoreinfo_note();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
{
unsigned long *iter;
struct kprobe_blacklist_entry *ent;
- unsigned long offset = 0, size = 0;
+ unsigned long entry, offset = 0, size = 0;
for (iter = start; iter < end; iter++) {
- if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
- pr_err("Failed to find blacklist %p\n", (void *)*iter);
+ entry = arch_deref_entry_point((void *)*iter);
+
+ if (!kernel_text_address(entry) ||
+ !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+ pr_err("Failed to find blacklist at %p\n",
+ (void *)entry);
continue;
}
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
return -ENOMEM;
- ent->start_addr = *iter;
- ent->end_addr = *iter + size;
+ ent->start_addr = entry;
+ ent->end_addr = entry + size;
INIT_LIST_HEAD(&ent->list);
list_add_tail(&ent->list, &kprobe_blacklist);
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
list_add_tail(&work->node, pos);
work->worker = worker;
- if (likely(worker->task))
+ if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
}
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
+
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
- struct optimistic_spin_queue *node,
- struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_queue *lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
{
- struct optimistic_spin_queue *next = NULL;
+ struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
for (;;) {
- if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
return next;
}
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *prev, *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
node->locked = 0;
node->next = NULL;
+ node->cpu = curr;
- node->prev = prev = xchg(lock, node);
- if (likely(prev == NULL))
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
return true;
+ prev = decode_cpu(old);
+ node->prev = prev;
ACCESS_ONCE(prev->next) = node;
/*
@@ -149,20 +180,21 @@ unqueue:
return false;
}
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *next;
+ struct optimistic_spin_node *node, *next;
+ int curr = encode_cpu(smp_processor_id());
/*
* Fast path for the uncontended case.
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
return;
/*
* Second most likely case.
*/
+ node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* mutex_lock()/rwsem_down_{read,write}() etc.
*/
-struct optimistic_spin_queue {
- struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
};
-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->osq = NULL;
+ osq_lock_init(&lock->osq);
#endif
debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
unsigned long flags;
if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->activity != 0);
+ ret = (sem->count != 0);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
}
return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->activity = 0;
+ sem->count = 0;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
waiter = list_entry(next, struct rwsem_waiter, list);
} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
- sem->activity += woken;
+ sem->count += woken;
out:
return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
/* granted */
- sem->activity++;
+ sem->count++;
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
goto out;
}
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+ if (sem->count >= 0 && list_empty(&sem->wait_list)) {
/* granted */
- sem->activity++;
+ sem->count++;
ret = 1;
}
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
* itself into sleep and waiting for system woke it or someone
* else in the head of the wait list up.
*/
- if (sem->activity == 0)
+ if (sem->count == 0)
break;
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
/* got the lock */
- sem->activity = -1;
+ sem->count = -1;
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (sem->activity == 0) {
+ if (sem->count == 0) {
/* got the lock */
- sem->activity = -1;
+ sem->count = -1;
ret = 1;
}
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+ if (--sem->count == 0 && !list_empty(&sem->wait_list))
sem = __rwsem_wake_one_writer(sem);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- sem->activity = 0;
+ sem->count = 0;
if (!list_empty(&sem->wait_list))
sem = __rwsem_do_wake(sem, 1);
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
raw_spin_lock_irqsave(&sem->wait_lock, flags);
- sem->activity = 1;
+ sem->count = 1;
if (!list_empty(&sem->wait_list))
sem = __rwsem_do_wake(sem, 0);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
sem->owner = NULL;
- sem->osq = NULL;
+ osq_lock_init(&sem->osq);
#endif
}
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
return false;
}
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
* Try to acquire write lock before the writer has been put on wait queue.
*/
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
- bool on_cpu = true;
+ bool on_cpu = false;
if (need_resched())
- return 0;
+ return false;
rcu_read_lock();
owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
rcu_read_unlock();
/*
- * If sem->owner is not set, the rwsem owner may have
- * just acquired it and not set the owner yet or the rwsem
- * has been released.
+ * If sem->owner is not set, yet we have just recently entered the
+ * slowpath, then there is a possibility reader(s) may have the lock.
+ * To be safe, avoid spinning in these situations.
*/
return on_cpu;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
#include <linux/atomic.h>
-#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
sem->owner = current;
diff --git a/kernel/module.c b/kernel/module.c
index 81e727cf6df9..ae79ce615cb9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,7 +60,6 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
-#include <linux/fips.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
}
/* Not having a signature is only an error if we're strict. */
- if (err < 0 && fips_enabled)
- panic("Module verification failed with error %d in FIPS mode\n",
- err);
if (err == -ENOKEY && !sig_enforce)
err = 0;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fcc2611d3f14..a9dfa79b6bab 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode)
}
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
dpm_complete(msg);
@@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode)
dpm_resume_end(PMSG_RECOVER);
}
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -535,7 +531,6 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
- ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -579,7 +574,6 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- ftrace_start();
resume_console();
Close:
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
printk("Restarting tasks ... ");
+ __usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..4b736b4dfa96 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -248,7 +248,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- ftrace_stop();
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -275,7 +274,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
Enable_cpus:
enable_nonboot_cpus();
- ftrace_start();
Platform_wake:
if (need_suspend_ops(state) && suspend_ops->wake)
@@ -306,7 +304,7 @@ int suspend_devices_and_enter(suspend_state_t state)
error = suspend_ops->begin(state);
if (error)
goto Close;
- } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
+ } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
error = freeze_ops->begin();
if (error)
goto Close;
@@ -335,7 +333,7 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
if (need_suspend_ops(state) && suspend_ops->end)
suspend_ops->end();
- else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
+ else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
freeze_ops->end();
return error;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7fa34f86e5ba..948a7693748e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -18,7 +18,7 @@
* Copyright (C) IBM Corporation, 2005, 2006
*
* Authors: Paul E. McKenney <paulmck@us.ibm.com>
- * Josh Triplett <josh@freedesktop.org>
+ * Josh Triplett <josh@joshtriplett.org>
*
* See also: Documentation/RCU/torture.txt
*/
@@ -51,7 +51,7 @@
#include <linux/torture.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
torture_param(int, fqs_duration, 0,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
rdp->passed_quiesce = 1;
}
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+ .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+ .dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+ .dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state. This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp;
+ int resched_mask;
+ struct rcu_state *rsp;
+
+ local_irq_save(flags);
+
+ /*
+ * Yes, we can lose flag-setting operations. This is OK, because
+ * the flag will be set again after some delay.
+ */
+ resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+ raw_cpu_write(rcu_sched_qs_mask, 0);
+
+ /* Find the flavor that needs a quiescent state. */
+ for_each_rcu_flavor(rsp) {
+ rdp = raw_cpu_ptr(rsp->rda);
+ if (!(resched_mask & rsp->flavor_mask))
+ continue;
+ smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+ if (ACCESS_ONCE(rdp->mynode->completed) !=
+ ACCESS_ONCE(rdp->cond_resched_completed))
+ continue;
+
+ /*
+ * Pretend to be momentarily idle for the quiescent state.
+ * This allows the grace-period kthread to record the
+ * quiescent state, with no need for this CPU to do anything
+ * further.
+ */
+ rdtp = this_cpu_ptr(&rcu_dynticks);
+ smp_mb__before_atomic(); /* Earlier stuff before QS. */
+ atomic_add(2, &rdtp->dynticks); /* QS. */
+ smp_mb__after_atomic(); /* Later stuff after QS. */
+ break;
+ }
+ local_irq_restore(flags);
+}
+
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
- .dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
- .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
- .dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
-
static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000; /* If this many pending, ignore blimit. */
static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
bool *isidle, unsigned long *maxj)
{
unsigned int curr;
+ int *rcrmp;
unsigned int snap;
curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
}
/*
- * There is a possibility that a CPU in adaptive-ticks state
- * might run in the kernel with the scheduling-clock tick disabled
- * for an extended time period. Invoke rcu_kick_nohz_cpu() to
- * force the CPU to restart the scheduling-clock tick in this
- * CPU is in this state.
- */
- rcu_kick_nohz_cpu(rdp->cpu);
-
- /*
- * Alternatively, the CPU might be running in the kernel
- * for an extended period of time without a quiescent state.
- * Attempt to force the CPU through the scheduler to gain the
- * needed quiescent state, but only if the grace period has gone
- * on for an uncommonly long time. If there are many stuck CPUs,
- * we will beat on the first one until it gets unstuck, then move
- * to the next. Only do this for the primary flavor of RCU.
+ * A CPU running for an extended time within the kernel can
+ * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
+ * even context-switching back and forth between a pair of
+ * in-kernel CPU-bound tasks cannot advance grace periods.
+ * So if the grace period is old enough, make the CPU pay attention.
+ * Note that the unsynchronized assignments to the per-CPU
+ * rcu_sched_qs_mask variable are safe. Yes, setting of
+ * bits can be lost, but they will be set again on the next
+ * force-quiescent-state pass. So lost bit sets do not result
+ * in incorrect behavior, merely in a grace period lasting
+ * a few jiffies longer than it might otherwise. Because
+ * there are at most four threads involved, and because the
+ * updates are only once every few jiffies, the probability of
+ * lossage (and thus of slight grace-period extension) is
+ * quite low.
+ *
+ * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+ * is set too high, we override with half of the RCU CPU stall
+ * warning delay.
*/
- if (rdp->rsp == rcu_state_p &&
+ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+ if (ULONG_CMP_GE(jiffies,
+ rdp->rsp->gp_start + jiffies_till_sched_qs) ||
ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
- rdp->rsp->jiffies_resched += 5;
- resched_cpu(rdp->cpu);
+ if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+ ACCESS_ONCE(rdp->cond_resched_completed) =
+ ACCESS_ONCE(rdp->mynode->completed);
+ smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+ ACCESS_ONCE(*rcrmp) =
+ ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+ } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+ /* Time to beat on that CPU again! */
+ resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
+ rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ }
}
return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
"rcu_node_fqs_1",
"rcu_node_fqs_2",
"rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
for (i = 1; i < rcu_num_lvls; i++)
rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
rcu_init_levelspread(rsp);
+ rsp->flavor_mask = fl_mask;
+ fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
unsigned long offline_fqs; /* Kicked due to being offline. */
+ unsigned long cond_resched_completed;
+ /* Grace period that needs help */
+ /* from cond_resched(). */
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
* if an adaptive-ticks CPU is failing to respond to the current grace
* period and has not be idle from an RCU perspective, kick it.
*/
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
EXPORT_SYMBOL_GPL(wait_rcu_gp);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-static inline void debug_init_rcu_head(struct rcu_head *head)
+void init_rcu_head(struct rcu_head *head)
{
debug_object_init(head, &rcuhead_debug_descr);
}
-static inline void debug_rcu_head_free(struct rcu_head *head)
+void destroy_rcu_head(struct rcu_head *head)
{
debug_object_free(head, &rcuhead_debug_descr);
}
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-DEFINE_PER_CPU(int, rcu_cond_resched_count);
-
-/*
- * Report a set of RCU quiescent states, for use by cond_resched()
- * and friends. Out of line due to being called infrequently.
- */
-void rcu_resched(void)
-{
- preempt_disable();
- __this_cpu_write(rcu_cond_resched_count, 0);
- rcu_note_context_switch(smp_processor_id());
- preempt_enable();
-}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..126f7e3f04e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
int __sched _cond_resched(void)
{
- rcu_cond_resched();
if (should_resched()) {
__cond_resched();
return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
- bool need_rcu_resched = rcu_should_resched();
int resched = should_resched();
int ret = 0;
lockdep_assert_held(lock);
- if (spin_needbreak(lock) || resched || need_rcu_resched) {
+ if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
__cond_resched();
- else if (unlikely(need_rcu_resched))
- rcu_resched();
else
cpu_relax();
ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
if (should_resched()) {
local_bh_enable();
__cond_resched();
@@ -8088,7 +8083,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .base_cftypes = cpu_files,
+ .legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
- .base_cftypes = files,
+ .legacy_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
- do_div(avg_atom, nr_switches);
+ avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
struct itimerspec *new_setting,
struct itimerspec *old_setting)
{
+ ktime_t exp;
+
if (!rtcdev)
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (old_setting)
alarm_timer_get(timr, old_setting);
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
- alarm_start(&timr->it.alarm.alarmtimer,
- timespec_to_ktime(new_setting->it_value));
+ exp = timespec_to_ktime(new_setting->it_value);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now;
+
+ now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ alarm_start(&timr->it.alarm.alarmtimer, exp);
return 0;
}
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (!alarmtimer_get_rtcdev())
return -ENOTSUPP;
+ if (flags & ~TIMER_ABSTIME)
+ return -EINVAL;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ad362c260ef4..9c94c19f1305 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
{
/* Nothing to do if we already reached the limit */
if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
- printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n");
+ printk_deferred(KERN_WARNING
+ "CE: Reprogramming failure. Giving up\n");
dev->next_event.tv64 = KTIME_MAX;
return -ETIME;
}
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
if (dev->min_delta_ns > MIN_DELTA_LIMIT)
dev->min_delta_ns = MIN_DELTA_LIMIT;
- printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
- dev->name ? dev->name : "?",
- (unsigned long long) dev->min_delta_ns);
+ printk_deferred(KERN_WARNING
+ "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
return 0;
}
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 445106d2c729..01d2d15aa662 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -191,7 +191,8 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void)
{
- sched_clock_poll(&sched_clock_timer);
+ update_sched_clock();
+ hrtimer_cancel(&sched_clock_timer);
cd.suspended = true;
return 0;
}
@@ -199,6 +200,7 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void)
{
cd.epoch_cyc = read_sched_clock();
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
cd.suspended = false;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4409356f40d..a5da09c899dd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
help
See Documentation/trace/ftrace-design.txt
-config HAVE_FUNCTION_TRACE_MCOUNT_TEST
- bool
- help
- See Documentation/trace/ftrace-design.txt
-
config HAVE_DYNAMIC_FTRACE
bool
help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2611613f14f1..67d6369ddf83 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ac9d1dad630b..1654b12c891a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
-/* Quick disabling of function tracer. */
-int function_trace_stop __read_mostly;
-
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
@@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
#ifdef CONFIG_DYNAMIC_FTRACE
+static struct ftrace_ops *removed_ops;
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
@@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int ret;
int i;
/*
- * Remove the current set, update the hash and add
- * them back.
- */
- ftrace_hash_rec_disable(ops, enable);
-
- /*
* If the new source is empty, just free dst and assign it
* the empty_hash.
*/
if (!src->count) {
- free_ftrace_hash_rcu(*dst);
- rcu_assign_pointer(*dst, EMPTY_HASH);
- /* still need to update the function records */
- ret = 0;
- goto out;
+ new_hash = EMPTY_HASH;
+ goto update;
}
/*
@@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
if (bits > FTRACE_HASH_MAX_BITS)
bits = FTRACE_HASH_MAX_BITS;
- ret = -ENOMEM;
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- goto out;
+ return -ENOMEM;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
+update:
+ /*
+ * Remove the current set, update the hash and add
+ * them back.
+ */
+ ftrace_hash_rec_disable(ops, enable);
+
old_hash = *dst;
rcu_assign_pointer(*dst, new_hash);
free_ftrace_hash_rcu(old_hash);
- ret = 0;
- out:
- /*
- * Enable regardless of ret:
- * On success, we enable the new hash.
- * On failure, we re-enable the original hash.
- */
ftrace_hash_rec_enable(ops, enable);
- return ret;
+ return 0;
}
/*
@@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end)
return (int)!!ret;
}
+/* Test if ops registered to this rec needs regs */
+static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ bool keep_regs = false;
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(ops, rec->ip, rec)) {
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ keep_regs = true;
+ break;
+ }
+ }
+ }
+
+ return keep_regs;
+}
+
+static void ftrace_remove_tramp(struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_func_entry *entry;
+
+ entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
+ if (!entry)
+ return;
+
+ /*
+ * The tramp_hash entry will be removed at time
+ * of update.
+ */
+ ops->nr_trampolines--;
+ rec->flags &= ~FTRACE_FL_TRAMP;
+}
+
+static void ftrace_clear_tramps(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->nr_trampolines)
+ ftrace_remove_tramp(op, rec);
+ } while_for_each_ftrace_op(op);
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (inc) {
rec->flags++;
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return;
+
+ /*
+ * If there's only a single callback registered to a
+ * function, and the ops has a trampoline registered
+ * for it, then we can call it directly.
+ */
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
+ rec->flags |= FTRACE_FL_TRAMP;
+ ops->nr_trampolines++;
+ } else {
+ /*
+ * If we are adding another function callback
+ * to this function, and the previous had a
+ * trampoline used, then we need to go back to
+ * the default trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /* remove trampolines from any ops for this rec */
+ ftrace_clear_tramps(rec);
+ }
+
/*
* If any ops wants regs saved for this function
* then all ops will get saved regs.
@@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
rec->flags |= FTRACE_FL_REGS;
} else {
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
return;
rec->flags--;
+
+ if (ops->trampoline && !ftrace_rec_count(rec))
+ ftrace_remove_tramp(ops, rec);
+
+ /*
+ * If the rec had REGS enabled and the ops that is
+ * being removed had REGS set, then see if there is
+ * still any ops for this record that wants regs.
+ * If not, we can stop recording them.
+ */
+ if (ftrace_rec_count(rec) > 0 &&
+ rec->flags & FTRACE_FL_REGS &&
+ ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ if (!test_rec_ops_needs_regs(rec))
+ rec->flags &= ~FTRACE_FL_REGS;
+ }
+
+ /*
+ * flags will be cleared in ftrace_check_record()
+ * if rec count is zero.
+ */
}
count++;
/* Shortcut, if we handled all records, we are done. */
@@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* If we are disabling calls, then disable all records that
* are enabled.
*/
- if (enable && (rec->flags & ~FTRACE_FL_MASK))
+ if (enable && ftrace_rec_count(rec))
flag = FTRACE_FL_ENABLED;
/*
- * If enabling and the REGS flag does not match the REGS_EN, then
- * do not ignore this record. Set flags to fail the compare against
- * ENABLED.
+ * If enabling and the REGS flag does not match the REGS_EN, or
+ * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
+ * this record. Set flags to fail the compare against ENABLED.
*/
- if (flag &&
- (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
- flag |= FTRACE_FL_REGS;
+ if (flag) {
+ if (!(rec->flags & FTRACE_FL_REGS) !=
+ !(rec->flags & FTRACE_FL_REGS_EN))
+ flag |= FTRACE_FL_REGS;
+
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ !(rec->flags & FTRACE_FL_TRAMP_EN))
+ flag |= FTRACE_FL_TRAMP;
+ }
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
else
rec->flags &= ~FTRACE_FL_REGS_EN;
}
+ if (flag & FTRACE_FL_TRAMP) {
+ if (rec->flags & FTRACE_FL_TRAMP)
+ rec->flags |= FTRACE_FL_TRAMP_EN;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP_EN;
+ }
}
/*
@@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* Otherwise,
* return UPDATE_MODIFY_CALL to tell the caller to convert
* from the save regs, to a non-save regs function or
- * vice versa.
+ * vice versa, or from a trampoline call.
*/
if (flag & FTRACE_FL_ENABLED)
return FTRACE_UPDATE_MAKE_CALL;
@@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
if (update) {
/* If there's no more users, clear all flags */
- if (!(rec->flags & ~FTRACE_FL_MASK))
+ if (!ftrace_rec_count(rec))
rec->flags = 0;
else
/* Just disable the record (keep REGS state) */
@@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
return ftrace_check_record(rec, enable, 0);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ /* Removed ops need to be tested first */
+ if (removed_ops && removed_ops->tramp_hash) {
+ if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
+ return removed_ops;
+ }
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!op->tramp_hash)
+ continue;
+
+ if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
+ return op;
+
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(op, rec->ip, rec))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
/**
* ftrace_get_addr_new - Get the call address to set to
* @rec: The ftrace record descriptor
@@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
*/
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP) {
+ ops = ftrace_find_tramp_ops_new(rec);
+ if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
if (rec->flags & FTRACE_FL_REGS)
return (unsigned long)FTRACE_REGS_ADDR;
else
@@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
*/
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (FTRACE_WARN_ON(!ops)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
if (rec->flags & FTRACE_FL_REGS_EN)
return (unsigned long)FTRACE_REGS_ADDR;
else
@@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command)
ftrace_run_stop_machine(command);
}
+static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int size, bits;
+ int ret;
+
+ size = ops->nr_trampolines;
+ bits = 0;
+ /*
+ * Make the hash size about 1/2 the # found
+ */
+ for (size /= 2; size; size >>= 1)
+ bits++;
+
+ ops->tramp_hash = alloc_ftrace_hash(bits);
+ /*
+ * TODO: a failed allocation is going to screw up
+ * the accounting of what needs to be modified
+ * and not. For now, we kill ftrace if we fail
+ * to allocate here. But there are ways around this,
+ * but that will take a little more work.
+ */
+ if (!ops->tramp_hash)
+ return -ENOMEM;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (ftrace_rec_count(rec) == 1 &&
+ ftrace_ops_test(ops, rec->ip, rec)) {
+
+ /*
+ * If another ops adds to a rec, the rec will
+ * lose its trampoline and never get it back
+ * until all ops are off of it.
+ */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
+ continue;
+
+ /* This record had better have a trampoline */
+ if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
+ return -1;
+
+ ret = add_hash_entry(ops->tramp_hash, rec->ip);
+ if (ret < 0)
+ return ret;
+ }
+ } while_for_each_ftrace_rec();
+
+ /* The number of recs in the hash must match nr_trampolines */
+ FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
+
+ return 0;
+}
+
+static int ftrace_save_tramp_hashes(void)
+{
+ struct ftrace_ops *op;
+ int ret;
+
+ /*
+ * Now that any trampoline is being used, we need to save the
+ * hashes for the ops that have them. This allows the mapping
+ * back from the record to the ops that has the trampoline to
+ * know what code is being replaced. Modifying code must always
+ * verify what it is changing.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ /* The tramp_hash is recreated each time. */
+ free_ftrace_hash(op->tramp_hash);
+ op->tramp_hash = NULL;
+
+ if (op->nr_trampolines) {
+ ret = ftrace_save_ops_tramp_hash(op);
+ if (ret)
+ return ret;
+ }
+
+ } while_for_each_ftrace_op(op);
+
+ return 0;
+}
+
static void ftrace_run_update_code(int command)
{
int ret;
@@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
if (ret)
return;
- /*
- * Do not call function tracer while we update the code.
- * We are in stop machine.
- */
- function_trace_stop++;
/*
* By default we use stop_machine() to modify the code.
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- function_trace_stop--;
-
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
+
+ ret = ftrace_save_tramp_hashes();
+ FTRACE_WARN_ON(ret);
}
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
-static int global_start_up;
static void control_ops_free(struct ftrace_ops *ops)
{
@@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
ftrace_hash_rec_disable(ops, 1);
- if (!global_start_up)
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
command |= FTRACE_UPDATE_CALLS;
@@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
return 0;
}
+ /*
+ * If the ops uses a trampoline, then it needs to be
+ * tested first on update.
+ */
+ removed_ops = ops;
+
ftrace_run_update_code(command);
+ removed_ops = NULL;
+
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
@@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- while (start_pg) {
+ pg = start_pg;
+ while (pg) {
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
start_pg = pg->next;
@@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER &&
- ftrace_hash_empty(ops->filter_hash)) {
+ if ((iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->filter_hash)) ||
+ (iter->flags & FTRACE_ITER_NOTRACE &&
+ ftrace_hash_empty(ops->notrace_hash))) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v)
return t_hash_show(m, iter);
if (iter->flags & FTRACE_ITER_PRINTALL) {
- seq_printf(m, "#### all functions enabled ####\n");
+ if (iter->flags & FTRACE_ITER_NOTRACE)
+ seq_printf(m, "#### no functions disabled ####\n");
+ else
+ seq_printf(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v)
return 0;
seq_printf(m, "%ps", (void *)rec->ip);
- if (iter->flags & FTRACE_ITER_ENABLED)
+ if (iter->flags & FTRACE_ITER_ENABLED) {
seq_printf(m, " (%ld)%s",
- rec->flags & ~FTRACE_FL_MASK,
- rec->flags & FTRACE_FL_REGS ? " R" : "");
+ ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ struct ftrace_ops *ops;
+
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (ops && ops->trampoline)
+ seq_printf(m, "\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ seq_printf(m, "\ttramp: ERROR!");
+ }
+ }
+
seq_printf(m, "\n");
return 0;
@@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return iter ? 0 : -ENOMEM;
}
-static void ftrace_filter_reset(struct ftrace_hash *hash)
-{
- mutex_lock(&ftrace_lock);
- ftrace_hash_clear(hash);
- mutex_unlock(&ftrace_lock);
-}
-
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
hash = ops->filter_hash;
if (file->f_mode & FMODE_WRITE) {
- iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (file->f_flags & O_TRUNC)
+ iter->hash = alloc_ftrace_hash(size_bits);
+ else
+ iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
@@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
}
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC))
- ftrace_filter_reset(iter->hash);
-
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
@@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
else
orig_hash = &ops->notrace_hash;
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (reset)
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ else
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+
if (!hash) {
ret = -ENOMEM;
goto out_regex_unlock;
}
- if (reset)
- ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
ret = -EINVAL;
goto out_regex_unlock;
@@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
static int __init set_graph_function(char *str)
@@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str)
}
__setup("ftrace_graph_filter=", set_graph_function);
-static void __init set_ftrace_early_graph(char *buf)
+static int __init set_graph_notrace_function(char *str)
+{
+ strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_notrace=", set_graph_notrace_function);
+
+static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
+ unsigned long *table = ftrace_graph_funcs;
+ int *count = &ftrace_graph_count;
+
+ if (!enable) {
+ table = ftrace_graph_notrace_funcs;
+ count = &ftrace_graph_notrace_count;
+ }
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void)
ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
- set_ftrace_early_graph(ftrace_graph_buf);
+ set_ftrace_early_graph(ftrace_graph_buf, 1);
+ if (ftrace_graph_notrace_buf[0])
+ set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
@@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v)
return 0;
if (ptr == (unsigned long *)1) {
- seq_printf(m, "#### all functions enabled ####\n");
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (fgd->table == ftrace_graph_funcs)
+ seq_printf(m, "#### all functions enabled ####\n");
+ else
+ seq_printf(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- if (function_trace_stop)
- return;
-
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
@@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip, regs)) {
- if (WARN_ON(!op->func)) {
- function_trace_stop = 1;
- printk("op=%p %pS\n", op, op);
+ if (FTRACE_WARN_ON(!op->func)) {
+ pr_warn("op=%p %pS\n", op, op);
goto out;
}
op->func(ip, parent_ip, op, regs);
@@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
/* Function graph doesn't use the .func field of global_ops */
global_ops.flags |= FTRACE_OPS_FL_STUB;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Optimize function graph calling (if implemented by arch) */
+ if (FTRACE_GRAPH_TRAMP_ADDR != 0)
+ global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
+#endif
+
ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
@@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void)
__ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
global_ops.flags &= ~FTRACE_OPS_FL_STUB;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (FTRACE_GRAPH_TRAMP_ADDR != 0)
+ global_ops.trampoline = 0;
+#endif
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
@@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
kfree(ret_stack);
}
-
-void ftrace_graph_stop(void)
-{
- ftrace_stop();
-}
#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ff7027199a9a..925f629658d6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1689,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!cpu_buffer->nr_pages_to_update)
continue;
- /* The update must run on the CPU that is being updated. */
- preempt_disable();
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
- preempt_disable();
}
- preempt_enable();
}
/* wait for all the updates to complete */
@@ -1742,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
- preempt_disable();
- /* The update must run on the CPU that is being updated. */
- if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
- preempt_disable();
}
- preempt_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
@@ -3775,7 +3759,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
- if (iter->head >= local_read(&iter->head_page->page->commit)) {
+ if (iter->head >= rb_page_size(iter->head_page)) {
rb_inc_iter(iter);
goto again;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bda9621638cc..8bb80fe08767 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -823,7 +823,7 @@ static struct {
{ trace_clock_local, "local", 1 },
{ trace_clock_global, "global", 1 },
{ trace_clock_counter, "counter", 0 },
- { trace_clock_jiffies, "uptime", 1 },
+ { trace_clock_jiffies, "uptime", 0 },
{ trace_clock, "perf", 1 },
ARCH_TRACE_CLOCKS
};
@@ -937,30 +937,6 @@ out:
return ret;
}
-ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
-{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
-}
-
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
@@ -3699,6 +3675,7 @@ static const char readme_msg[] =
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
" set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
" max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -4238,10 +4215,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
char buf[64];
int r;
@@ -4253,10 +4229,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
}
static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
unsigned long val;
int ret;
@@ -4269,6 +4244,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
+static ssize_t
+tracing_thresh_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_thresh_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
+ if (ret < 0)
+ goto out;
+
+ if (tr->current_trace->update_thresh) {
+ ret = tr->current_trace->update_thresh(tr);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = cnt;
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
@@ -5170,6 +5191,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
#endif /* CONFIG_TRACER_SNAPSHOT */
+static const struct file_operations tracing_thresh_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_thresh_read,
+ .write = tracing_thresh_write,
+ .llseek = generic_file_llseek,
+};
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -6107,10 +6135,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
if (!topts)
return;
- for (cnt = 0; topts[cnt].opt; cnt++) {
- if (topts[cnt].entry)
- debugfs_remove(topts[cnt].entry);
- }
+ for (cnt = 0; topts[cnt].opt; cnt++)
+ debugfs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6533,7 +6559,7 @@ static __init int tracer_init_debugfs(void)
init_tracer_debugfs(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
+ &global_trace, &tracing_thresh_fops);
trace_create_file("README", 0444, d_tracer,
NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9258f5a815db..385391fb1d3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -339,6 +339,7 @@ struct tracer_flags {
* @reset: called when one switches to another tracer
* @start: called when tracing is unpaused (echo 1 > tracing_enabled)
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
* @close: called when the trace file is released
@@ -357,6 +358,7 @@ struct tracer {
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
+ int (*update_thresh)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ * Note that this use of jiffies_64 is not completely safe on
+ * 32-bit systems. But the window is tiny, and the effect if
+ * we are affected is that we will have an obviously bogus
+ * timestamp on a trace event - i.e. not life threatening.
*/
u64 notrace trace_clock_jiffies(void)
{
- u64 jiffy = jiffies - INITIAL_JIFFIES;
-
- /* Return nsecs */
- return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+ return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0d8ee29f6b9a..ef06ce7e9cf8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,6 +8,8 @@
*
*/
+#define pr_fmt(fmt) fmt
+
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
@@ -1491,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->entry = debugfs_create_dir(name, parent);
if (!dir->entry) {
- pr_warning("Failed to create system directory %s\n", name);
+ pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
@@ -1507,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create debugfs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1522,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
out_fail:
/* Only print this message if failed on memory allocation */
if (!dir || !system)
- pr_warning("No memory to create event subsystem %s\n",
- name);
+ pr_warn("No memory to create event subsystem %s\n", name);
return NULL;
}
@@ -1551,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
name = ftrace_event_name(call);
file->dir = debugfs_create_dir(name, d_events);
if (!file->dir) {
- pr_warning("Could not create debugfs '%s' directory\n",
- name);
+ pr_warn("Could not create debugfs '%s' directory\n", name);
return -1;
}
@@ -1575,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
if (list_empty(head)) {
ret = call->class->define_fields(call);
if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", name);
+ pr_warn("Could not initialize trace point events/%s\n",
+ name);
return -1;
}
}
@@ -1648,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call)
if (call->class->raw_init) {
ret = call->class->raw_init(call);
if (ret < 0 && ret != -ENOSYS)
- pr_warn("Could not initialize trace events/%s\n",
- name);
+ pr_warn("Could not initialize trace events/%s\n", name);
}
return ret;
@@ -1895,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- ftrace_event_name(call));
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(call));
}
}
@@ -2208,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- ftrace_event_name(file->event_call));
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(file->event_call));
}
}
@@ -2232,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create early event %s\n",
- ftrace_event_name(call));
+ pr_warn("Could not create early event %s\n",
+ ftrace_event_name(call));
}
}
@@ -2280,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
entry = debugfs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warning("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create debugfs 'set_event' entry\n");
return -ENOMEM;
}
d_events = debugfs_create_dir("events", parent);
if (!d_events) {
- pr_warning("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create debugfs 'events' directory\n");
return -ENOMEM;
}
@@ -2462,11 +2461,10 @@ static __init int event_trace_init(void)
entry = debugfs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warning("Could not create debugfs "
- "'available_events' entry\n");
+ pr_warn("Could not create debugfs 'available_events' entry\n");
if (trace_define_common_fields())
- pr_warning("tracing: Failed to allocate common fields");
+ pr_warn("tracing: Failed to allocate common fields");
ret = early_event_add_tracer(d_tracer, tr);
if (ret)
@@ -2475,7 +2473,7 @@ static __init int event_trace_init(void)
#ifdef CONFIG_MODULES
ret = register_module_notifier(&trace_module_nb);
if (ret)
- pr_warning("Failed to register trace events module notifier\n");
+ pr_warn("Failed to register trace events module notifier\n");
#endif
return 0;
}
@@ -2579,7 +2577,7 @@ static __init void event_trace_self_tests(void)
* it and the self test should not be on.
*/
if (file->flags & FTRACE_EVENT_FL_ENABLED) {
- pr_warning("Enabled event during self test!\n");
+ pr_warn("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
}
@@ -2607,8 +2605,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling system %s\n",
- system->name);
+ pr_warn("error enabling system %s\n",
+ system->name);
continue;
}
@@ -2616,8 +2614,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling system %s\n",
- system->name);
+ pr_warn("error disabling system %s\n",
+ system->name);
continue;
}
@@ -2631,7 +2629,7 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling all events\n");
+ pr_warn("error enabling all events\n");
return;
}
@@ -2640,7 +2638,7 @@ static __init void event_trace_self_tests(void)
/* reset sysname */
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling all events\n");
+ pr_warn("error disabling all events\n");
return;
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4de3e57f723c..f0a0c982cde3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,33 @@
#include "trace.h"
#include "trace_output.h"
+static bool kill_ftrace_graph;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+ return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracincg
+ *
+ * In case of an error int function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+ kill_ftrace_graph = true;
+}
+
/* When set, irq functions will be ignored */
static int ftrace_graph_skip_irqs;
@@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
unsigned long long calltime;
int index;
+ if (unlikely(ftrace_graph_is_dead()))
+ return -EBUSY;
+
if (!current->ret_stack)
return -EBUSY;
@@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
{
if (tracing_thresh)
return 1;
@@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr)
smp_mb();
}
-void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
{
if (tracing_thresh &&
(trace->rettime - trace->calltime < tracing_thresh))
@@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)
unregister_ftrace_graph();
}
+static int graph_trace_update_thresh(struct trace_array *tr)
+{
+ graph_trace_reset(tr);
+ return graph_trace_init(tr);
+}
+
static int max_bytes_for_cpu;
static enum print_line_t
@@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
seq_printf(s, " | | | |\n");
}
-void print_graph_headers(struct seq_file *s)
+static void print_graph_headers(struct seq_file *s)
{
print_graph_headers_flags(s, tracer_flags.val);
}
@@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = {
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
+ .update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f3dad80c20b2..c6977d5a9b12 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
-int trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
- int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
- int ret;
-
- ret = seq_write(m, s->buffer, len);
-
- /*
- * Only reset this buffer if we successfully wrote to the
- * seq_file buffer.
- */
- if (!ret)
- trace_seq_init(s);
-
- return ret;
-}
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
return TRACE_TYPE_HANDLED;
}
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- va_list ap;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
- va_end(ap);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_printf);
-
-/**
- * trace_seq_bitmask - put a list of longs as a bitmask print output
- * @s: trace sequence descriptor
- * @maskp: points to an array of unsigned longs that represent a bitmask
- * @nmaskbits: The number of bits that are valid in @maskp
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * Writes a ASCII representation of a bitmask string into @s.
- */
-int
-trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
- int nmaskbits)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_bitmask);
-
-/**
- * trace_seq_vprintf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-EXPORT_SYMBOL_GPL(trace_seq_vprintf);
-
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-int trace_seq_puts(struct trace_seq *s, const char *str)
-{
- int len = strlen(str);
-
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- s->buffer[s->len++] = c;
-
- return 1;
-}
-EXPORT_SYMBOL(trace_seq_putc);
-
-int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
-{
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
-{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- int i, j;
-
- if (s->full)
- return 0;
-
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < len; i++) {
-#else
- for (i = len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- hex[j++] = ' ';
-
- return trace_seq_putmem(s, hex, j);
-}
-
-void *trace_seq_reserve(struct trace_seq *s, size_t len)
-{
- void *ret;
-
- if (s->full)
- return NULL;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return NULL;
- }
-
- ret = s->buffer + s->len;
- s->len += len;
-
- return ret;
-}
-
-int trace_seq_path(struct trace_seq *s, const struct path *path)
-{
- unsigned char *p;
-
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
- }
-
- s->full = 1;
- return 0;
-}
-
const char *
ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
@@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
{
unsigned long mask;
const char *str;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
for (i = 0; flag_array[i].name && flags; i++) {
@@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
const struct trace_print_flags *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
trace_seq_putc(p, 0);
@@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%llx", val);
trace_seq_putc(p, 0);
@@ -430,7 +162,7 @@ const char *
ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
unsigned int bitmask_size)
{
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
trace_seq_putc(p, 0);
@@ -443,7 +175,7 @@ const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..80b25b585a70 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define MAX_MEMHEX_BYTES 8
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
#define SEQ_PUT_FIELD_RET(s, x) \
do { \
if (!trace_seq_putmem(s, &(x), sizeof(x))) \
@@ -46,7 +43,6 @@ do { \
#define SEQ_PUT_HEX_FIELD_RET(s, x) \
do { \
- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
return TRACE_TYPE_PARTIAL_LINE; \
} while (0)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..1f24ed99dca2
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,428 @@
+/*
+ * trace_seq.c
+ *
+ * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The trace_seq is a handy tool that allows you to pass a descriptor around
+ * to a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the trace_seq must be initialized with trace_seq_init().
+ * This will set up the counters within the descriptor. You can call
+ * trace_seq_init() more than once to reset the trace_seq to start
+ * from scratch.
+ *
+ * The buffer size is currently PAGE_SIZE, although it may become dynamic
+ * in the future.
+ *
+ * A write to the buffer will either succed or fail. That is, unlike
+ * sprintf() there will not be a partial write (well it may write into
+ * the buffer but it wont update the pointers). This allows users to
+ * try to write something into the trace_seq buffer and if it fails
+ * they can flush it and try again.
+ *
+ */
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/trace_seq.h>
+
+/* How much buffer is left on the trace_seq? */
+#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
+
+/* How much buffer is written? */
+#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
+
+/**
+ * trace_print_seq - move the contents of trace_seq into a seq_file
+ * @m: the seq_file descriptor that is the destination
+ * @s: the trace_seq descriptor that is the source.
+ *
+ * Returns 0 on success and non zero on error. If it succeeds to
+ * write to the seq_file it will reset the trace_seq, otherwise
+ * it does not modify the trace_seq to let the caller try again.
+ */
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ unsigned int len = TRACE_SEQ_BUF_USED(s);
+ int ret;
+
+ ret = seq_write(m, s->buffer, len);
+
+ /*
+ * Only reset this buffer if we successfully wrote to the
+ * seq_file buffer. This lets the caller try again or
+ * do something else with the contents.
+ */
+ if (!ret)
+ trace_seq_init(s);
+
+ return ret;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf() is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ va_list ap;
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_bitmask - write a bitmask array in its ASCII representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes a ASCII representation of a bitmask string into @s.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
+/**
+ * trace_seq_bprintf - Write the printf string from binary arguments
+ * @s: trace sequence descriptor
+ * @fmt: The format string for the @binary arguments
+ * @binary: The binary arguments for @fmt.
+ *
+ * When recording in a fast path, a printf may be recorded with just
+ * saving the format and the arguments as they were passed to the
+ * function, instead of wasting cycles converting the arguments into
+ * ASCII characters. Instead, the arguments are saved in a 32 bit
+ * word array that is defined by the format string constraints.
+ *
+ * This function will take the format and the binary array and finish
+ * the conversion into the ASCII string within the buffer.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bprintf);
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ unsigned int len = strlen(str);
+
+ if (s->full)
+ return 0;
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return 0;
+ }
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_puts);
+
+/**
+ * trace_seq_putc - trace sequence printing of simple character
+ * @s: trace sequence descriptor
+ * @c: simple character to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple charater
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->full)
+ return 0;
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putc);
+
+/**
+ * trace_seq_putmem - write raw data into the trace_seq buffer
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+{
+ if (s->full)
+ return 0;
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return 0;
+ }
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem);
+
+#define MAX_MEMHEX_BYTES 8U
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
+
+/**
+ * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to write its hex ASCII representation of
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * This is similar to trace_seq_putmem() except instead of just copying the
+ * raw memory into the buffer it writes its ASCII representation of it
+ * in hex characters.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+ unsigned int len)
+{
+ unsigned char hex[HEX_CHARS];
+ const unsigned char *data = mem;
+ unsigned int start_len;
+ int i, j;
+ int cnt = 0;
+
+ if (s->full)
+ return 0;
+
+ while (len) {
+ start_len = min(len, HEX_CHARS - 1);
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < start_len; i++) {
+#else
+ for (i = start_len-1, j = 0; i >= 0; i--) {
+#endif
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
+ }
+ if (WARN_ON_ONCE(j == 0 || j/2 > len))
+ break;
+
+ /* j increments twice per loop */
+ len -= j / 2;
+ hex[j++] = ' ';
+
+ cnt += trace_seq_putmem(s, hex, j);
+ }
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
+
+/**
+ * trace_seq_path - copy a path into the sequence buffer
+ * @s: trace sequence descriptor
+ * @path: path to write into the sequence buffer.
+ *
+ * Write a path name into the sequence buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_path(struct trace_seq *s, const struct path *path)
+{
+ unsigned char *p;
+
+ if (s->full)
+ return 0;
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+ if (!IS_ERR(p)) {
+ p = mangle_path(s->buffer + s->len, p, "\n");
+ if (p) {
+ s->len = p - s->buffer;
+ return 1;
+ }
+ } else {
+ s->buffer[s->len++] = '?';
+ return 1;
+ }
+
+ s->full = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(trace_seq_path);
+
+/**
+ * trace_seq_to_user - copy the squence buffer to user space
+ * @s: trace sequence descriptor
+ * @ubuf: The userspace memory location to copy to
+ * @cnt: The amount to copy
+ *
+ * Copies the sequence buffer into the userspace memory pointed to
+ * by @ubuf. It starts from the last read position (@s->readpos)
+ * and writes up to @cnt characters or till it reaches the end of
+ * the content in the buffer (@s->len), which ever comes first.
+ *
+ * On success, it returns a positive number of the number of bytes
+ * it copied.
+ *
+ * On failure it returns -EBUSY if all of the content in the
+ * sequence has been already read, which includes nothing in the
+ * sequenc (@s->len == @s->readpos).
+ *
+ * Returns -EFAULT if the copy to userspace fails.
+ */
+int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
+{
+ int len;
+ int ret;
+
+ if (!cnt)
+ return 0;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret == cnt)
+ return -EFAULT;
+
+ cnt -= ret;
+
+ s->readpos += cnt;
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac69600..5dbe22aa3efd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,7 +265,6 @@ struct workqueue_struct {
static struct kmem_cache *pwq_cache;
-static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
@@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
- /*
- * nr_idle and idle_list may disagree if idle rebinding is in
- * progress. Never return %true if idle_list is empty.
- */
- if (list_empty(&pool->idle_list))
- return false;
-
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
pool = worker->pool;
/* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
return NULL;
/*
@@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
*
- * Set @flags in @worker->flags and adjust nr_running accordingly. If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
* spin_lock_irq(pool->lock)
*/
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
- bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
- /*
- * If transitioning into NOT_RUNNING, adjust nr_running and
- * wake up an idle worker as necessary if requested by
- * @wakeup.
- */
+ /* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- if (wakeup) {
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- wake_up_worker(pool);
- } else
- atomic_dec(&pool->nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
pwq_activate_delayed_work(work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+ pwq_dec_nr_in_flight(pwq, get_work_color(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
@@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)
(worker->hentry.next || worker->hentry.pprev)))
return;
- /* can't use worker_set_flags(), also called from start_worker() */
+ /* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
@@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
}
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int node)
{
struct worker *worker;
- worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
@@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker,
detach_completion = pool->detach_completion;
mutex_unlock(&pool->attach_mutex);
+ /* clear leftover flags without pool->lock after it is detached */
+ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
if (detach_completion)
complete(detach_completion);
}
@@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker,
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
- * Create a new worker which is attached to @pool. The new worker must be
- * started by start_worker().
+ * Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool)
if (id < 0)
goto fail;
- worker = alloc_worker();
+ worker = alloc_worker(pool->node);
if (!worker)
goto fail;
@@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool)
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
+ /* start the newly created worker */
+ spin_lock_irq(&pool->lock);
+ worker->pool->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
+ spin_unlock_irq(&pool->lock);
+
return worker;
fail:
@@ -1734,44 +1722,6 @@ fail:
}
/**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
- worker->pool->nr_workers++;
- worker_enter_idle(worker);
- wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- spin_unlock_irq(&pool->lock);
- }
-
- return worker ? 0 : -ENOMEM;
-}
-
-/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
@@ -1909,23 +1859,10 @@ restart:
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- if (WARN_ON_ONCE(need_to_create_worker(pool)))
- goto restart;
- return true;
- }
-
- if (!need_to_create_worker(pool))
+ if (create_worker(pool) || !need_to_create_worker(pool))
break;
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(CREATE_COOLDOWN);
+ schedule_timeout_interruptible(CREATE_COOLDOWN);
if (!need_to_create_worker(pool))
break;
@@ -1933,6 +1870,11 @@ restart:
del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
+ /*
+ * This is necessary even after a new worker was just successfully
+ * created as @pool->lock was dropped and the new worker might have
+ * already become busy.
+ */
if (need_to_create_worker(pool))
goto restart;
return true;
@@ -2020,13 +1962,8 @@ __acquires(&pool->lock)
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
- /*
- * Ensure we're on the correct CPU. DISASSOCIATED test is
- * necessary to avoid spurious warnings from rescuers servicing the
- * unbound or a disassociated pool.
- */
- WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
- !(pool->flags & POOL_DISASSOCIATED) &&
+ /* ensure we're on the correct CPU */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
@@ -2052,17 +1989,22 @@ __acquires(&pool->lock)
list_del_init(&work->entry);
/*
- * CPU intensive works don't participate in concurrency
- * management. They're the scheduler's responsibility.
+ * CPU intensive works don't participate in concurrency management.
+ * They're the scheduler's responsibility. This takes @worker out
+ * of concurrency management and the next code block will chain
+ * execution of the pending work items.
*/
if (unlikely(cpu_intensive))
- worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Unbound pool isn't concurrency managed and work items should be
- * executed ASAP. Wake up another worker if necessary.
+ * Wake up another worker if necessary. The condition is always
+ * false for normal per-cpu workers since nr_running would always
+ * be >= 1 at this point. This is used to chain execution of the
+ * pending work items for WORKER_NOT_RUNNING workers such as the
+ * UNBOUND and CPU_INTENSIVE ones.
*/
- if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
/*
@@ -2218,7 +2160,7 @@ recheck:
}
} while (keep_working(pool));
- worker_set_flags(worker, WORKER_PREP, false);
+ worker_set_flags(worker, WORKER_PREP);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
@@ -2311,29 +2253,27 @@ repeat:
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
- spin_unlock_irq(&pool->lock);
-
- worker_detach_from_pool(rescuer, pool);
-
- spin_lock_irq(&pool->lock);
/*
* Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're holding its lock.
+ * go away while we're still attached to it.
*/
put_pwq(pwq);
/*
- * Leave this pool. If keep_working() is %true, notify a
+ * Leave this pool. If need_more_worker() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
- if (keep_working(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
rescuer->pool = NULL;
- spin_unlock(&pool->lock);
- spin_lock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
+
+ worker_detach_from_pool(rescuer, pool);
+
+ spin_lock_irq(&wq_mayday_lock);
}
spin_unlock_irq(&wq_mayday_lock);
@@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)
return;
/* sanity checks */
- if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+ if (WARN_ON(!(pool->cpu < 0)) ||
WARN_ON(!list_empty(&pool->worklist)))
return;
@@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
if (wqattrs_equal(pool->attrs, attrs)) {
pool->refcnt++;
- goto out_unlock;
+ return pool;
}
}
@@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
goto fail;
/* create and start the initial worker */
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
goto fail;
/* install */
hash_add(unbound_pool_hash, &pool->hash_node, hash);
-out_unlock:
+
return pool;
fail:
if (pool)
@@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
return;
- /*
- * Unlink @pwq. Synchronization against wq->mutex isn't strictly
- * necessary on release but do it anyway. It's easier to verify
- * and consistent with the linking path.
- */
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
@@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)
if (!list_empty(&pwq->pwqs_node))
return;
- /*
- * Set the matching work_color. This is synchronized with
- * wq->mutex to avoid confusing flush_workqueue().
- */
+ /* set the matching work_color */
pwq->work_color = wq->work_color;
/* sync max_active to the current setting */
@@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
return -EINVAL;
- pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
- tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+ tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
@@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
- rescuer = alloc_worker();
+ rescuer = alloc_worker(NUMA_NO_NODE);
if (!rescuer)
goto err_destroy;
@@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work)
struct worker *worker;
for_each_cpu_worker_pool(pool, cpu) {
- WARN_ON_ONCE(cpu != smp_processor_id());
-
mutex_lock(&pool->attach_mutex);
spin_lock_irq(&pool->lock);
@@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags;
@@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
return NOTIFY_BAD;
}
break;
@@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_pool(pool, pi) {
mutex_lock(&pool->attach_mutex);
- if (pool->cpu == cpu) {
- spin_lock_irq(&pool->lock);
- pool->flags &= ~POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
-
+ if (pool->cpu == cpu)
rebind_workers(pool);
- } else if (pool->cpu < 0) {
+ else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
- }
mutex_unlock(&pool->attach_mutex);
}
@@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void)
cpumask_var_t *tbl;
int node, cpu;
- /* determine NUMA pwq table len - highest node id + 1 */
- for_each_node(node)
- wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
-
if (num_possible_nodes() <= 1)
return;
@@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void)
* available. Build one from cpu_to_node() which should have been
* fully initialized by now.
*/
- tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+ tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
BUG_ON(!tbl);
for_each_node(node)
@@ -4936,7 +4858,7 @@ static int __init init_workqueues(void)
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
- BUG_ON(create_and_start_worker(pool) < 0);
+ BUG_ON(!create_worker(pool));
}
}