diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 130 | ||||
-rw-r--r-- | kernel/exit.c | 2 | ||||
-rw-r--r-- | kernel/printk.c | 159 | ||||
-rw-r--r-- | kernel/sched/core.c | 92 | ||||
-rw-r--r-- | kernel/sched/fair.c | 113 | ||||
-rw-r--r-- | kernel/sched/sched.h | 23 | ||||
-rw-r--r-- | kernel/trace/trace.c | 7 | ||||
-rw-r--r-- | kernel/trace/trace_functions.c | 36 |
8 files changed, 425 insertions, 137 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd652dd12..f33c7153b6d7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). + */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + +/* + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. */ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { struct sched_domain_attr *attr; cpumask_var_t *doms; @@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void) mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); + + if (!cpu_online) + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void) /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). + * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) @@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_OFFLINE: /* * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. + * scan_cpusets_upon_hotplug() will update it. */ - scan_for_empty_cpusets(&top_cpuset); + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); break; default: break; diff --git a/kernel/exit.c b/kernel/exit.c index d17f6c4ddfa9..f65345f9e5bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -483,7 +483,7 @@ static void close_files(struct files_struct * files) rcu_read_unlock(); for (;;) { unsigned long set; - i = j * __NFDBITS; + i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; diff --git a/kernel/printk.c b/kernel/printk.c index ac4bc9e79465..50c96b5651b6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -216,6 +216,7 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -228,14 +229,19 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; -#ifdef CONFIG_PRINTK static u32 log_next_idx; +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +static enum log_flags console_prev; + /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; -#define LOG_LINE_MAX 1024 +#define PREFIX_MAX 32 +#define LOG_LINE_MAX 1024 - PREFIX_MAX /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -360,6 +366,7 @@ static void log_store(int facility, int level, struct devkmsg_user { u64 seq; u32 idx; + enum log_flags prev; struct mutex lock; char buf[8192]; }; @@ -425,6 +432,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, struct log *msg; u64 ts_usec; size_t i; + char cont = '-'; size_t len; ssize_t ret; @@ -462,8 +470,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); - len = sprintf(user->buf, "%u,%llu,%llu;", - (msg->facility << 3) | msg->level, user->seq, ts_usec); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + len = sprintf(user->buf, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); + user->prev = msg->flags; /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { @@ -646,6 +671,15 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. + */ + VMCOREINFO_STRUCT_SIZE(log); + VMCOREINFO_OFFSET(log, ts_nsec); + VMCOREINFO_OFFSET(log, len); + VMCOREINFO_OFFSET(log, text_len); + VMCOREINFO_OFFSET(log, dict_len); } #endif @@ -876,7 +910,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, if (buf) { if (print_prefix(msg, syslog, NULL) + - text_len + 1>= size - len) + text_len + 1 >= size - len) break; if (prefix) @@ -907,7 +941,7 @@ static int syslog_print(char __user *buf, int size) struct log *msg; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -930,7 +964,8 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -969,7 +1004,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -1022,7 +1057,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; break; @@ -1349,20 +1385,36 @@ static struct cont { u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(void) +static void cont_flush(enum log_flags flags) { if (cont.flushed) return; if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); - - cont.flushed = true; + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. + */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } } static bool cont_add(int facility, int level, const char *text, size_t len) @@ -1371,7 +1423,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len) return false; if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); return false; } @@ -1380,12 +1433,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); + cont.flags = 0; cont.cons = 0; cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + return true; } @@ -1394,7 +1452,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0) { + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1409,7 +1467,8 @@ static size_t cont_print_text(char *text, size_t size) } if (cont.flushed) { - text[textlen++] = '\n'; + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; /* got everything, release buffer */ cont.len = 0; } @@ -1507,7 +1566,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(); + cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1525,7 +1584,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(); + cont_flush(LOG_NEWLINE); } if (!stored) @@ -1616,9 +1675,20 @@ asmlinkage int printk(const char *fmt, ...) } EXPORT_SYMBOL(printk); -#else +#else /* CONFIG_PRINTK */ +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 #define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; +static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1902,10 +1972,34 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; -static enum log_flags console_prev; +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. + */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} /** * console_unlock - unlock the console system @@ -1923,7 +2017,7 @@ static enum log_flags console_prev; */ void console_unlock(void) { - static char text[LOG_LINE_MAX]; + static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1937,19 +2031,7 @@ void console_unlock(void) console_may_schedule = 0; /* flush buffered message fragment immediately to console */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (cont.len && (cont.cons < cont.len || cont.flushed)) { - size_t len; - - len = cont_print_text(text, sizeof(text)); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, text, len); - start_critical_timings(); - local_irq_restore(flags); - } else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - + console_cont_flush(text, sizeof(text)); again: for (;;) { struct log *msg; @@ -1986,6 +2068,7 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; goto skip; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1ba..5d011ef4c0df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * + * Iterate domains and sched_groups downward, assigning CPUs to be + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing + * due to random perturbation self canceling, ie sw buddies pull + * their counterpart to their CPU's hw counterpart. + * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { + struct sched_domain *tmp = sd; + struct sched_group *sg, *prev; + bool right; + + /* + * Traverse to first CPU in group, and count hops + * to cpu from there, switching direction on each + * hop, never ever pointing the last CPU rightward. + */ + do { + id = cpumask_first(sched_domain_span(tmp)); + prev = sg = tmp->groups; + right = 1; + + while (cpumask_first(sched_group_cpus(sg)) != id) + sg = sg->next; + + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { + prev = sg; + sg = sg->next; + right = !right; + } + + /* A CPU went down, never point back to domain start. */ + if (right && cpumask_first(sched_group_cpus(sg->next)) == id) + right = false; + + sg = right ? sg->next : prev; + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); + } while ((tmp = tmp->child)); + id = cpumask_first(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; @@ -7097,34 +7134,66 @@ match2: mutex_unlock(&sched_domains_mutex); } +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(true); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); - return NOTIFY_OK; + cpuset_update_active_cpus(false); + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) @@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe3..22321db64952 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } @@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; unsigned int flags; @@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -4267,6 +4281,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -4284,7 +4299,13 @@ more_balance: double_rq_lock(this_rq, busiest); if (!env.loop) update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4296,14 +4317,52 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435a..c35a1a7dd4d6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a120f98c4112..5c38c81496ce 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = t; + current_trace = &nop_trace; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { + topts = create_trace_option_files(t); + if (t->use_max_tr) { int cpu; /* we need to make per cpu buffer sizes equivalent */ for_each_tracing_cpu(cpu) { @@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf) goto out; } + current_trace = t; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db09..a426f410c060 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,6 +13,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/pstore.h> #include <linux/fs.h> #include "trace.h" @@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_PSTORE = 0x2, +}; + +static struct tracer_flags func_flags; + static void function_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { + /* + * So far tracing doesn't support multiple buffers, so + * we make an explicit call for now. + */ + if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) + pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly = .flags = FTRACE_OPS_FL_GLOBAL, }; -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif +#ifdef CONFIG_PSTORE_FTRACE + { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, +#endif { } /* Always set a last empty entry */ }; @@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void) static int func_set_flag(u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; if (set) { unregister_ftrace_function(&trace_ops); @@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - return 0; + break; + case TRACE_FUNC_OPT_PSTORE: + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } static struct tracer function_trace __read_mostly = |