summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpuset.c130
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/printk.c159
-rw-r--r--kernel/sched/core.c92
-rw-r--r--kernel/sched/fair.c113
-rw-r--r--kernel/sched/sched.h23
-rw-r--r--kernel/trace/trace.c7
-rw-r--r--kernel/trace/trace_functions.c36
8 files changed, 425 insertions, 137 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
+/* the type of hotplug event */
+enum hotplug_event {
+ CPUSET_CPU_OFFLINE,
+ CPUSET_MEM_OFFLINE,
+};
+
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
/*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+ struct cpuset *cp;
+ struct cpuset *child; /* scans child cpusets of cp */
+ struct cgroup *cont;
+
+ if (list_empty(queue))
+ return NULL;
+
+ cp = list_first_entry(queue, struct cpuset, stack_list);
+ list_del(queue->next);
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, queue);
+ }
+
+ return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
*
* Called with cgroup_mutex held. We take callback_mutex to modify
* cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* before dropping down to the next. It always processes a node before
* any of its children.
*
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
*/
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
{
LIST_HEAD(queue);
- struct cpuset *cp; /* scans cpusets being updated */
- struct cpuset *child; /* scans child cpusets of cp */
- struct cgroup *cont;
+ struct cpuset *cp; /* scans cpusets being updated */
static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
- while (!list_empty(&queue)) {
- cp = list_first_entry(&queue, struct cpuset, stack_list);
- list_del(queue.next);
- list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
- child = cgroup_cs(cont);
- list_add_tail(&child->stack_list, &queue);
+ switch (event) {
+ case CPUSET_CPU_OFFLINE:
+ while ((cp = cpuset_next(&queue)) != NULL) {
+
+ /* Continue past cpusets with all cpus online */
+ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+ continue;
+
+ /* Remove offline cpus from this cpuset. */
+ mutex_lock(&callback_mutex);
+ cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+ cpu_active_mask);
+ mutex_unlock(&callback_mutex);
+
+ /* Move tasks from the empty cpuset to a parent */
+ if (cpumask_empty(cp->cpus_allowed))
+ remove_tasks_in_empty_cpuset(cp);
+ else
+ update_tasks_cpumask(cp, NULL);
}
+ break;
- /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
- nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
- continue;
+ case CPUSET_MEM_OFFLINE:
+ while ((cp = cpuset_next(&queue)) != NULL) {
- oldmems = cp->mems_allowed;
+ /* Continue past cpusets with all mems online */
+ if (nodes_subset(cp->mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ continue;
- /* Remove offline cpus and mems from this cpuset. */
- mutex_lock(&callback_mutex);
- cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_active_mask);
- nodes_and(cp->mems_allowed, cp->mems_allowed,
+ oldmems = cp->mems_allowed;
+
+ /* Remove offline mems from this cpuset. */
+ mutex_lock(&callback_mutex);
+ nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_HIGH_MEMORY]);
- mutex_unlock(&callback_mutex);
+ mutex_unlock(&callback_mutex);
- /* Move tasks from the empty cpuset to a parent */
- if (cpumask_empty(cp->cpus_allowed) ||
- nodes_empty(cp->mems_allowed))
- remove_tasks_in_empty_cpuset(cp);
- else {
- update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems, NULL);
+ /* Move tasks from the empty cpuset to a parent */
+ if (nodes_empty(cp->mems_allowed))
+ remove_tasks_in_empty_cpuset(cp);
+ else
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
* (of no affect) on systems that are actively using CPU hotplug
* but making no active use of cpusets.
*
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_active_mask on each CPU hotplug (cpuhp) event.
*
* Called within get_online_cpus(). Needs to call cgroup_lock()
* before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
*/
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
mutex_unlock(&callback_mutex);
- scan_for_empty_cpusets(&top_cpuset);
+
+ if (!cpu_online)
+ scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
*/
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
case MEM_OFFLINE:
/*
* needn't update top_cpuset.mems_allowed explicitly because
- * scan_for_empty_cpusets() will update it.
+ * scan_cpusets_upon_hotplug() will update it.
*/
- scan_for_empty_cpusets(&top_cpuset);
+ scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
break;
default:
break;
diff --git a/kernel/exit.c b/kernel/exit.c
index d17f6c4ddfa9..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
rcu_read_unlock();
for (;;) {
unsigned long set;
- i = j * __NFDBITS;
+ i = j * BITS_PER_LONG;
if (i >= fdt->max_fds)
break;
set = fdt->open_fds[j++];
diff --git a/kernel/printk.c b/kernel/printk.c
index ac4bc9e79465..50c96b5651b6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
*/
static DEFINE_RAW_SPINLOCK(logbuf_lock);
+#ifdef CONFIG_PRINTK
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
/* index and sequence number of the next record to store in the buffer */
static u64 log_next_seq;
-#ifdef CONFIG_PRINTK
static u32 log_next_idx;
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags console_prev;
+
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
static u32 clear_idx;
-#define LOG_LINE_MAX 1024
+#define PREFIX_MAX 32
+#define LOG_LINE_MAX 1024 - PREFIX_MAX
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
struct devkmsg_user {
u64 seq;
u32 idx;
+ enum log_flags prev;
struct mutex lock;
char buf[8192];
};
@@ -425,6 +432,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
struct log *msg;
u64 ts_usec;
size_t i;
+ char cont = '-';
size_t len;
ssize_t ret;
@@ -462,8 +470,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
msg = log_from_idx(user->idx);
ts_usec = msg->ts_nsec;
do_div(ts_usec, 1000);
- len = sprintf(user->buf, "%u,%llu,%llu;",
- (msg->facility << 3) | msg->level, user->seq, ts_usec);
+
+ /*
+ * If we couldn't merge continuation line fragments during the print,
+ * export the stored flags to allow an optional external merge of the
+ * records. Merging the records isn't always neccessarily correct, like
+ * when we hit a race during printing. In most cases though, it produces
+ * better readable output. 'c' in the record flags mark the first
+ * fragment of a line, '+' the following.
+ */
+ if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
+ cont = 'c';
+ else if ((msg->flags & LOG_CONT) ||
+ ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+ cont = '+';
+
+ len = sprintf(user->buf, "%u,%llu,%llu,%c;",
+ (msg->facility << 3) | msg->level,
+ user->seq, ts_usec, cont);
+ user->prev = msg->flags;
/* escape non-printable characters */
for (i = 0; i < msg->text_len; i++) {
@@ -646,6 +671,15 @@ void log_buf_kexec_setup(void)
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(log_first_idx);
VMCOREINFO_SYMBOL(log_next_idx);
+ /*
+ * Export struct log size and field offsets. User space tools can
+ * parse it and detect any changes to structure down the line.
+ */
+ VMCOREINFO_STRUCT_SIZE(log);
+ VMCOREINFO_OFFSET(log, ts_nsec);
+ VMCOREINFO_OFFSET(log, len);
+ VMCOREINFO_OFFSET(log, text_len);
+ VMCOREINFO_OFFSET(log, dict_len);
}
#endif
@@ -876,7 +910,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
if (buf) {
if (print_prefix(msg, syslog, NULL) +
- text_len + 1>= size - len)
+ text_len + 1 >= size - len)
break;
if (prefix)
@@ -907,7 +941,7 @@ static int syslog_print(char __user *buf, int size)
struct log *msg;
int len = 0;
- text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
@@ -930,7 +964,8 @@ static int syslog_print(char __user *buf, int size)
skip = syslog_partial;
msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
+ n = msg_print_text(msg, syslog_prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
syslog_idx = log_next(syslog_idx);
@@ -969,7 +1004,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
char *text;
int len = 0;
- text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
@@ -1022,7 +1057,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
struct log *msg = log_from_idx(idx);
int textlen;
- textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
+ textlen = msg_print_text(msg, prev, true, text,
+ LOG_LINE_MAX + PREFIX_MAX);
if (textlen < 0) {
len = textlen;
break;
@@ -1349,20 +1385,36 @@ static struct cont {
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log level of first message */
+ enum log_flags flags; /* prefix, newline flags */
bool flushed:1; /* buffer sealed and committed */
} cont;
-static void cont_flush(void)
+static void cont_flush(enum log_flags flags)
{
if (cont.flushed)
return;
if (cont.len == 0)
return;
- log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
- NULL, 0, cont.buf, cont.len);
-
- cont.flushed = true;
+ if (cont.cons) {
+ /*
+ * If a fragment of this line was directly flushed to the
+ * console; wait for the console to pick up the rest of the
+ * line. LOG_NOCONS suppresses a duplicated output.
+ */
+ log_store(cont.facility, cont.level, flags | LOG_NOCONS,
+ cont.ts_nsec, NULL, 0, cont.buf, cont.len);
+ cont.flags = flags;
+ cont.flushed = true;
+ } else {
+ /*
+ * If no fragment of this line ever reached the console,
+ * just submit it to the store and free the buffer.
+ */
+ log_store(cont.facility, cont.level, flags, 0,
+ NULL, 0, cont.buf, cont.len);
+ cont.len = 0;
+ }
}
static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1371,7 +1423,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
return false;
if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
+ /* the line gets too long, split it up in separate records */
+ cont_flush(LOG_CONT);
return false;
}
@@ -1380,12 +1433,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
cont.level = level;
cont.owner = current;
cont.ts_nsec = local_clock();
+ cont.flags = 0;
cont.cons = 0;
cont.flushed = false;
}
memcpy(cont.buf + cont.len, text, len);
cont.len += len;
+
+ if (cont.len > (sizeof(cont.buf) * 80) / 100)
+ cont_flush(LOG_CONT);
+
return true;
}
@@ -1394,7 +1452,7 @@ static size_t cont_print_text(char *text, size_t size)
size_t textlen = 0;
size_t len;
- if (cont.cons == 0) {
+ if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
textlen += print_time(cont.ts_nsec, text);
size -= textlen;
}
@@ -1409,7 +1467,8 @@ static size_t cont_print_text(char *text, size_t size)
}
if (cont.flushed) {
- text[textlen++] = '\n';
+ if (cont.flags & LOG_NEWLINE)
+ text[textlen++] = '\n';
/* got everything, release buffer */
cont.len = 0;
}
@@ -1507,7 +1566,7 @@ asmlinkage int vprintk_emit(int facility, int level,
* or another task also prints continuation lines.
*/
if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
- cont_flush();
+ cont_flush(LOG_NEWLINE);
/* buffer line if possible, otherwise store it right away */
if (!cont_add(facility, level, text, text_len))
@@ -1525,7 +1584,7 @@ asmlinkage int vprintk_emit(int facility, int level,
if (cont.len && cont.owner == current) {
if (!(lflags & LOG_PREFIX))
stored = cont_add(facility, level, text, text_len);
- cont_flush();
+ cont_flush(LOG_NEWLINE);
}
if (!stored)
@@ -1616,9 +1675,20 @@ asmlinkage int printk(const char *fmt, ...)
}
EXPORT_SYMBOL(printk);
-#else
+#else /* CONFIG_PRINTK */
+#define LOG_LINE_MAX 0
+#define PREFIX_MAX 0
#define LOG_LINE_MAX 0
+static u64 syslog_seq;
+static u32 syslog_idx;
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags syslog_prev;
+static u64 log_first_seq;
+static u32 log_first_idx;
+static u64 log_next_seq;
+static enum log_flags console_prev;
static struct cont {
size_t len;
size_t cons;
@@ -1902,10 +1972,34 @@ void wake_up_klogd(void)
this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
}
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags console_prev;
+static void console_cont_flush(char *text, size_t size)
+{
+ unsigned long flags;
+ size_t len;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+
+ if (!cont.len)
+ goto out;
+
+ /*
+ * We still queue earlier records, likely because the console was
+ * busy. The earlier ones need to be printed before this one, we
+ * did not flush any fragment so far, so just let it queue up.
+ */
+ if (console_seq < log_next_seq && !cont.cons)
+ goto out;
+
+ len = cont_print_text(text, size);
+ raw_spin_unlock(&logbuf_lock);
+ stop_critical_timings();
+ call_console_drivers(cont.level, text, len);
+ start_critical_timings();
+ local_irq_restore(flags);
+ return;
+out:
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
/**
* console_unlock - unlock the console system
@@ -1923,7 +2017,7 @@ static enum log_flags console_prev;
*/
void console_unlock(void)
{
- static char text[LOG_LINE_MAX];
+ static char text[LOG_LINE_MAX + PREFIX_MAX];
static u64 seen_seq;
unsigned long flags;
bool wake_klogd = false;
@@ -1937,19 +2031,7 @@ void console_unlock(void)
console_may_schedule = 0;
/* flush buffered message fragment immediately to console */
- raw_spin_lock_irqsave(&logbuf_lock, flags);
- if (cont.len && (cont.cons < cont.len || cont.flushed)) {
- size_t len;
-
- len = cont_print_text(text, sizeof(text));
- raw_spin_unlock(&logbuf_lock);
- stop_critical_timings();
- call_console_drivers(cont.level, text, len);
- start_critical_timings();
- local_irq_restore(flags);
- } else
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
-
+ console_cont_flush(text, sizeof(text));
again:
for (;;) {
struct log *msg;
@@ -1986,6 +2068,7 @@ skip:
* will properly dump everything later.
*/
msg->flags &= ~LOG_NOCONS;
+ console_prev = msg->flags;
goto skip;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..5d011ef4c0df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ /*
+ * Traverse to first CPU in group, and count hops
+ * to cpu from there, switching direction on each
+ * hop, never ever pointing the last CPU rightward.
+ */
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right ? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7134,66 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
+ cpuset_update_active_cpus(false);
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
void __init sched_init_smp(void)
@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..22321db64952 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;
/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
return prev_cpu;
/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ for_each_lower_domain(sd) {
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}
@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
+#define LBF_SOME_PINNED 0x04
struct lb_env {
struct sched_domain *sd;
- int src_cpu;
struct rq *src_rq;
+ int src_cpu;
int dst_cpu;
struct rq *dst_rq;
+ struct cpumask *dst_grpmask;
+ int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
unsigned int flags;
@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ int new_dst_cpu;
+
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+ /*
+ * Remember if this task can be migrated to any other cpu in
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+ * Also avoid computing new_dst_cpu if we have already computed
+ * one in current iteration.
+ */
+ if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+ return 0;
+
+ new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+ tsk_cpus_allowed(p));
+ if (new_dst_cpu < nr_cpu_ids) {
+ env->flags |= LBF_SOME_PINNED;
+ env->new_dst_cpu = new_dst_cpu;
+ }
return 0;
}
+
+ /* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, active_balance = 0;
+ int ld_moved, cur_ld_moved, active_balance = 0;
+ int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
+ .dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
};
cpumask_copy(cpus, cpu_active_mask);
+ max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
@@ -4267,6 +4281,7 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
+ lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4299,13 @@ more_balance:
double_rq_lock(this_rq, busiest);
if (!env.loop)
update_h_load(env.src_cpu);
- ld_moved += move_tasks(&env);
+
+ /*
+ * cur_ld_moved - load moved in current iteration
+ * ld_moved - cumulative load moved across iterations
+ */
+ cur_ld_moved = move_tasks(&env);
+ ld_moved += cur_ld_moved;
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -4296,14 +4317,52 @@ more_balance:
/*
* some other cpu did the load balance for us.
*/
- if (ld_moved && this_cpu != smp_processor_id())
- resched_cpu(this_cpu);
+ if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+ resched_cpu(env.dst_cpu);
+
+ /*
+ * Revisit (affine) tasks on src_cpu that couldn't be moved to
+ * us and move them to an alternate dst_cpu in our sched_group
+ * where they can run. The upper limit on how many times we
+ * iterate on same src_cpu is dependent on number of cpus in our
+ * sched_group.
+ *
+ * This changes load balance semantics a bit on who can move
+ * load to a given_cpu. In addition to the given_cpu itself
+ * (or a ilb_cpu acting on its behalf where given_cpu is
+ * nohz-idle), we now have balance_cpu in a position to move
+ * load to given_cpu. In rare situations, this may cause
+ * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+ * _independently_ and at _same_ time to move some load to
+ * given_cpu) causing exceess load to be moved to given_cpu.
+ * This however should not happen so much in practice and
+ * moreover subsequent load balance cycles should correct the
+ * excess load moved.
+ */
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+ lb_iterations++ < max_lb_iterations) {
+
+ this_rq = cpu_rq(env.new_dst_cpu);
+ env.dst_rq = this_rq;
+ env.dst_cpu = env.new_dst_cpu;
+ env.flags &= ~LBF_SOME_PINNED;
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ /*
+ * Go back to "more_balance" rather than "redo" since we
+ * need to continue with same src_cpu.
+ */
+ goto more_balance;
+ }
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus))
+ if (!cpumask_empty(cpus)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
goto redo;
+ }
goto out_balanced;
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a120f98c4112..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
}
destroy_trace_option_files(topts);
- current_trace = t;
+ current_trace = &nop_trace;
- topts = create_trace_option_files(current_trace);
- if (current_trace->use_max_tr) {
+ topts = create_trace_option_files(t);
+ if (t->use_max_tr) {
int cpu;
/* we need to make per cpu buffer sizes equivalent */
for_each_tracing_cpu(cpu) {
@@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
goto out;
}
+ current_trace = t;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db09..a426f410c060 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/pstore.h>
#include <linux/fs.h>
#include "trace.h"
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
preempt_enable_notrace();
}
+/* Our two options */
+enum {
+ TRACE_FUNC_OPT_STACK = 0x1,
+ TRACE_FUNC_OPT_PSTORE = 0x2,
+};
+
+static struct tracer_flags func_flags;
+
static void
function_trace_call(unsigned long ip, unsigned long parent_ip)
{
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
+ /*
+ * So far tracing doesn't support multiple buffers, so
+ * we make an explicit call for now.
+ */
+ if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
+ pstore_ftrace_call(ip, parent_ip);
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
}
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
.flags = FTRACE_OPS_FL_GLOBAL,
};
-/* Our two options */
-enum {
- TRACE_FUNC_OPT_STACK = 0x1,
-};
-
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
+#ifdef CONFIG_PSTORE_FTRACE
+ { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
+#endif
{ } /* Always set a last empty entry */
};
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void)
static int func_set_flag(u32 old_flags, u32 bit, int set)
{
- if (bit == TRACE_FUNC_OPT_STACK) {
+ switch (bit) {
+ case TRACE_FUNC_OPT_STACK:
/* do nothing if already set */
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
- return 0;
+ break;
if (set) {
unregister_ftrace_function(&trace_ops);
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
register_ftrace_function(&trace_ops);
}
- return 0;
+ break;
+ case TRACE_FUNC_OPT_PSTORE:
+ break;
+ default:
+ return -EINVAL;
}
- return -EINVAL;
+ return 0;
}
static struct tracer function_trace __read_mostly =