diff options
Diffstat (limited to 'kernel/cgroup')
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 14 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 5 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 13 | ||||
-rw-r--r-- | kernel/cgroup/legacy_freezer.c | 8 | ||||
-rw-r--r-- | kernel/cgroup/rstat.c | 198 |
5 files changed, 118 insertions, 120 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..2a4a387f867a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; +/* Show unavailable controllers in /proc/cgroups */ +static bool proc_show_all; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) */ for_each_subsys(ss, i) { - if (cgroup1_subsys_absent(ss)) - continue; cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + if (!proc_show_all && cgroup1_subsys_absent(ss)) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), @@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); + +static int __init cgroup_v1_proc(char *str) +{ + return (kstrtobool(str, &proc_show_all) == 0); +} +__setup("cgroup_v1_proc=", cgroup_v1_proc); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a723b7dc6e4e..312c6a8b55bb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2074,6 +2074,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); +#ifdef CONFIG_CGROUP_BPF + for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) + cgrp->bpf.revisions[i] = 1; +#endif + init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3bc4301466f3..27adb04df675 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -280,7 +280,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { - static_branch_enable(&cpusets_insane_config_key); + static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); @@ -1843,7 +1843,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - } else if (is_partition_invalid(cs) && + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; @@ -3358,14 +3358,12 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); - css_put(&cs->css); return retval ?: nbytes; } @@ -3870,9 +3868,10 @@ retry: partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned - * back to a regular one. + * back to a regular one with a non-empty effective xcpus. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { @@ -4051,7 +4050,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); + hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 507b8f19a262..dd9417425d92 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; - unsigned int state; rcu_read_lock(); - /* Check if the cgroup is still FREEZING, but not FROZEN. The extra - * !FROZEN check is required, because the FREEZING bit is not cleared - * when the state FROZEN is reached. - */ - state = task_freezer(task)->state; - ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index cbeaa499a96a..a198e40c799b 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -10,7 +10,7 @@ #include <trace/events/cgroup.h> static DEFINE_SPINLOCK(rstat_base_lock); -static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); +static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -45,84 +45,11 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) return &rstat_base_lock; } -static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) +static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) { - if (ss) { - /* - * Depending on config, the subsystem per-cpu lock type may be an - * empty struct. In enviromnents where this is the case, allocation - * of this field is not performed in ss_rstat_init(). Avoid a - * cpu-based offset relative to NULL by returning early. When the - * lock type is zero in size, the corresponding lock functions are - * no-ops so passing them NULL is acceptable. - */ - if (sizeof(*ss->rstat_ss_cpu_lock) == 0) - return NULL; - - return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); - } - - return per_cpu_ptr(&rstat_base_cpu_lock, cpu); -} - -/* - * Helper functions for rstat per CPU locks. - * - * This makes it easier to diagnose locking issues and contention in - * production environments. The parameter @fast_path determine the - * tracepoints being added, allowing us to diagnose "flush" related - * operations without handling high-frequency fast-path "update" events. - */ -static __always_inline -unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, - const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - unsigned long flags; - bool contended; - - /* - * The _irqsave() is needed because the locks used for flushing are - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock - * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT - * kernel. The raw_spinlock_t below disables interrupts on both - * configurations. The _irqsave() ensures that interrupts are always - * disabled and later restored. - */ - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - contended = !raw_spin_trylock_irqsave(cpu_lock, flags); - if (contended) { - if (fast_path) - trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); - - raw_spin_lock_irqsave(cpu_lock, flags); - } - - if (fast_path) - trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); - - return flags; -} - -static __always_inline -void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, - unsigned long flags, const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - - if (fast_path) - trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); - else - trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); - - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - raw_spin_unlock_irqrestore(cpu_lock, flags); + if (ss) + return per_cpu_ptr(ss->lhead, cpu); + return per_cpu_ptr(&rstat_backlog_list, cpu); } /** @@ -130,13 +57,22 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * css_rstat_cpu definition for details. + * Atomically inserts the css in the ss's llist for the given cpu. This is + * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist + * will be processed at the flush time to create the update tree. + * + * NOTE: if the user needs the guarantee that the updater either add itself in + * the lockless list or the concurrent flusher flushes its updated stats, a + * memory barrier is needed before the call to css_rstat_updated() i.e. a + * barrier after updating the per-cpu stats and before calling + * css_rstat_updated(). */ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - unsigned long flags; + struct llist_head *lhead; + struct css_rstat_cpu *rstatc; + struct css_rstat_cpu __percpu *rstatc_pcpu; + struct llist_node *self; /* * Since bpf programs can call this function, prevent access to @@ -145,19 +81,49 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) if (!css_uses_rstat(css)) return; + lockdep_assert_preemption_disabled(); + + /* + * For archs withnot nmi safe cmpxchg or percpu ops support, ignore + * the requests from nmi context. + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + return; + + rstatc = css_rstat_cpu(css, cpu); /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. + * If already on list return. This check is racy and smp_mb() is needed + * to pair it with the smp_mb() in css_process_update_tree() if the + * guarantee that the updated stats are visible to concurrent flusher is + * needed. + */ + if (llist_on_list(&rstatc->lnode)) + return; + + /* + * This function can be renentered by irqs and nmis for the same cgroup + * and may try to insert the same per-cpu lnode into the llist. Note + * that llist_add() does not protect against such scenarios. * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @css is on the list by - * testing the next pointer for NULL. + * To protect against such stacked contexts of irqs/nmis, we use the + * fact that lnode points to itself when not on a list and then use + * this_cpu_cmpxchg() to atomically set to NULL to select the winner + * which will call llist_add(). The losers can assume the insertion is + * successful and the winner will eventually add the per-cpu lnode to + * the llist. */ - if (data_race(css_rstat_cpu(css, cpu)->updated_next)) + self = &rstatc->lnode; + rstatc_pcpu = css->rstat_cpu; + if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) return; - flags = _css_rstat_cpu_lock(css, cpu, true); + lhead = ss_lhead_cpu(css->ss, cpu); + llist_add(&rstatc->lnode, lhead); +} +static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) +{ /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); @@ -183,8 +149,34 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) css = parent; } +} - _css_rstat_cpu_unlock(css, cpu, flags, true); +static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) +{ + struct llist_head *lhead = ss_lhead_cpu(ss, cpu); + struct llist_node *lnode; + + while ((lnode = llist_del_first_init(lhead))) { + struct css_rstat_cpu *rstatc; + + /* + * smp_mb() is needed here (more specifically in between + * init_llist_node() and per-cpu stats flushing) if the + * guarantee is required by a rstat user where etiher the + * updater should add itself on the lockless list or the + * flusher flush the stats updated by the updater who have + * observed that they are already on the list. The + * corresponding barrier pair for this one should be before + * css_rstat_updated() by the user. + * + * For now, there aren't any such user, so not adding the + * barrier here but if such a use-case arise, please add + * smp_mb() here. + */ + + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); + __css_process_update_tree(rstatc->owner, cpu); + } } /** @@ -288,13 +280,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list( { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; - unsigned long flags; - flags = _css_rstat_cpu_lock(root, cpu, false); + css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) - goto unlock_ret; + return NULL; /* * Unlink @root from its parent. As the updated_children list is @@ -326,8 +317,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list( rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); -unlock_ret: - _css_rstat_cpu_unlock(root, cpu, flags, false); + return head; } @@ -468,7 +458,8 @@ int css_rstat_init(struct cgroup_subsys_state *css) for_each_possible_cpu(cpu) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - rstatc->updated_children = css; + rstatc->owner = rstatc->updated_children = css; + init_llist_node(&rstatc->lnode); if (is_self) { struct cgroup_rstat_base_cpu *rstatbc; @@ -488,6 +479,9 @@ void css_rstat_exit(struct cgroup_subsys_state *css) if (!css_uses_rstat(css)) return; + if (!css->rstat_cpu) + return; + css_rstat_flush(css); /* sanity check */ @@ -522,19 +516,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; - /* - * Depending on config, the subsystem per-cpu lock type may be an empty - * struct. Avoid allocating a size of zero in this case. - */ - if (ss && sizeof(*ss->rstat_ss_cpu_lock)) { - ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); - if (!ss->rstat_ss_cpu_lock) + if (ss) { + ss->lhead = alloc_percpu(struct llist_head); + if (!ss->lhead) return -ENOMEM; } spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) - raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + init_llist_head(ss_lhead_cpu(ss, cpu)); return 0; } |