summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/capability.c111
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpuset.c10
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/kgdb.c16
-rw-r--r--kernel/module.c18
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/sched.c468
-rw-r--r--kernel/sched_clock.c18
-rw-r--r--kernel/sched_debug.c5
-rw-r--r--kernel/sched_fair.c254
-rw-r--r--kernel/sched_rt.c4
-rw-r--r--kernel/sched_stats.h1
-rw-r--r--kernel/sched_trace.h41
-rw-r--r--kernel/signal.c51
-rw-r--r--kernel/stop_machine.c7
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/ftrace.c457
-rw-r--r--kernel/trace/trace.c153
-rw-r--r--kernel/trace/trace.h18
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_irqsoff.c16
-rw-r--r--kernel/trace/trace_mmiotrace.c295
-rw-r--r--kernel/trace/trace_sched_switch.c45
-rw-r--r--kernel/trace/trace_sched_wakeup.c67
-rw-r--r--kernel/trace/trace_selftest.c1
27 files changed, 1213 insertions, 865 deletions
diff --git a/kernel/capability.c b/kernel/capability.c
index 39e8193b41ea..cfbe44299488 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -53,6 +53,69 @@ static void warn_legacy_capability_use(void)
}
/*
+ * Version 2 capabilities worked fine, but the linux/capability.h file
+ * that accompanied their introduction encouraged their use without
+ * the necessary user-space source code changes. As such, we have
+ * created a version 3 with equivalent functionality to version 2, but
+ * with a header change to protect legacy source code from using
+ * version 2 when it wanted to use version 1. If your system has code
+ * that trips the following warning, it is using version 2 specific
+ * capabilities and may be doing so insecurely.
+ *
+ * The remedy is to either upgrade your version of libcap (to 2.10+,
+ * if the application is linked against it), or recompile your
+ * application with modern kernel headers and this warning will go
+ * away.
+ */
+
+static void warn_deprecated_v2(void)
+{
+ static int warned;
+
+ if (!warned) {
+ char name[sizeof(current->comm)];
+
+ printk(KERN_INFO "warning: `%s' uses deprecated v2"
+ " capabilities in a way that may be insecure.\n",
+ get_task_comm(name, current));
+ warned = 1;
+ }
+}
+
+/*
+ * Version check. Return the number of u32s in each capability flag
+ * array, or a negative value on error.
+ */
+static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
+{
+ __u32 version;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ switch (version) {
+ case _LINUX_CAPABILITY_VERSION_1:
+ warn_legacy_capability_use();
+ *tocopy = _LINUX_CAPABILITY_U32S_1;
+ break;
+ case _LINUX_CAPABILITY_VERSION_2:
+ warn_deprecated_v2();
+ /*
+ * fall through - v3 is otherwise equivalent to v2.
+ */
+ case _LINUX_CAPABILITY_VERSION_3:
+ *tocopy = _LINUX_CAPABILITY_U32S_3;
+ break;
+ default:
+ if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
* For sys_getproccap() and sys_setproccap(), any of the three
* capability set pointers may be NULL -- indicating that that set is
* uninteresting and/or not to be changed.
@@ -71,27 +134,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
{
int ret = 0;
pid_t pid;
- __u32 version;
struct task_struct *target;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
- if (get_user(version, &header->version))
- return -EFAULT;
-
- switch (version) {
- case _LINUX_CAPABILITY_VERSION_1:
- warn_legacy_capability_use();
- tocopy = _LINUX_CAPABILITY_U32S_1;
- break;
- case _LINUX_CAPABILITY_VERSION_2:
- tocopy = _LINUX_CAPABILITY_U32S_2;
- break;
- default:
- if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
- return -EFAULT;
- return -EINVAL;
- }
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
@@ -118,7 +167,7 @@ out:
spin_unlock(&task_capability_lock);
if (!ret) {
- struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i;
for (i = 0; i < tocopy; i++) {
@@ -128,7 +177,7 @@ out:
}
/*
- * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S,
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
* we silently drop the upper capabilities here. This
* has the effect of making older libcap
* implementations implicitly drop upper capability
@@ -240,30 +289,16 @@ static inline int cap_set_all(kernel_cap_t *effective,
*/
asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
- struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i, tocopy;
kernel_cap_t inheritable, permitted, effective;
- __u32 version;
struct task_struct *target;
int ret;
pid_t pid;
- if (get_user(version, &header->version))
- return -EFAULT;
-
- switch (version) {
- case _LINUX_CAPABILITY_VERSION_1:
- warn_legacy_capability_use();
- tocopy = _LINUX_CAPABILITY_U32S_1;
- break;
- case _LINUX_CAPABILITY_VERSION_2:
- tocopy = _LINUX_CAPABILITY_U32S_2;
- break;
- default:
- if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
- return -EFAULT;
- return -EINVAL;
- }
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
@@ -281,7 +316,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
permitted.cap[i] = kdata[i].permitted;
inheritable.cap[i] = kdata[i].inheritable;
}
- while (i < _LINUX_CAPABILITY_U32S) {
+ while (i < _KERNEL_CAPABILITY_U32S) {
effective.cap[i] = 0;
permitted.cap[i] = 0;
inheritable.cap[i] = 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fbc6fc8949b4..15ac0e1e4f4d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
cg = tsk->cgroups;
parent = task_cgroup(tsk, subsys->subsys_id);
- snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
+ snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
/* Pin the hierarchy */
atomic_inc(&parent->root->sb->s_active);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 86ea9e34e326..039baa4cd90c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -797,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
retval = cpulist_parse(buf, trialcs.cpus_allowed);
if (retval < 0)
return retval;
+
+ if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+ return -EINVAL;
}
- cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
retval = validate_change(cs, &trialcs);
if (retval < 0)
return retval;
@@ -932,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
goto done;
+
+ if (!nodes_subset(trialcs.mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ return -EINVAL;
}
- nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
- node_states[N_HIGH_MEMORY]);
oldmem = cs->mems_allowed;
if (nodes_equal(oldmem, trialcs.mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
diff --git a/kernel/exit.c b/kernel/exit.c
index 1510f78a0ffa..8f6185e69b69 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -126,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk)
__unhash_process(tsk);
+ /*
+ * Do this under ->siglock, we can race with another thread
+ * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
+ */
+ flush_sigqueue(&tsk->pending);
+
tsk->signal = NULL;
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
@@ -133,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk)
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- flush_sigqueue(&tsk->pending);
if (sig) {
flush_sigqueue(&sig->shared_pending);
taskstats_tgid_free(sig);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 14787de568b3..79e3c90113c2 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -52,6 +52,7 @@
#include <asm/byteorder.h>
#include <asm/atomic.h>
#include <asm/system.h>
+#include <asm/unaligned.h>
static int kgdb_break_asap;
@@ -227,8 +228,6 @@ void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
* GDB remote protocol parser:
*/
-static const char hexchars[] = "0123456789abcdef";
-
static int hex(char ch)
{
if ((ch >= 'a') && (ch <= 'f'))
@@ -316,8 +315,8 @@ static void put_packet(char *buffer)
}
kgdb_io_ops->write_char('#');
- kgdb_io_ops->write_char(hexchars[checksum >> 4]);
- kgdb_io_ops->write_char(hexchars[checksum & 0xf]);
+ kgdb_io_ops->write_char(hex_asc_hi(checksum));
+ kgdb_io_ops->write_char(hex_asc_lo(checksum));
if (kgdb_io_ops->flush)
kgdb_io_ops->flush();
@@ -478,8 +477,8 @@ static void error_packet(char *pkt, int error)
{
error = -error;
pkt[0] = 'E';
- pkt[1] = hexchars[(error / 10)];
- pkt[2] = hexchars[(error % 10)];
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
pkt[3] = '\0';
}
@@ -510,10 +509,7 @@ static void int_to_threadref(unsigned char *id, int value)
scan = (unsigned char *)id;
while (i--)
*scan++ = 0;
- *scan++ = (value >> 24) & 0xff;
- *scan++ = (value >> 16) & 0xff;
- *scan++ = (value >> 8) & 0xff;
- *scan++ = (value & 0xff);
+ put_unaligned_be32(value, scan);
}
static struct task_struct *getthread(struct pt_regs *regs, int tid)
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491ef7ac..5f80478b746d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1337,7 +1337,19 @@ out_unreg:
kobject_put(&mod->mkobj.kobj);
return err;
}
-#endif
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ kobject_put(&mod->mkobj.kobj);
+}
+
+#else /* CONFIG_SYSFS */
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+#endif /* CONFIG_SYSFS */
static void mod_kobject_remove(struct module *mod)
{
@@ -1345,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod)
module_param_sysfs_remove(mod);
kobject_put(mod->mkobj.drivers_dir);
kobject_put(mod->holders_dir);
- kobject_put(&mod->mkobj.kobj);
+ mod_sysfs_fini(mod);
}
/*
@@ -1780,7 +1792,7 @@ static struct module *load_module(void __user *umod,
/* Sanity checks against insmoding binaries or wrong arch,
weird elf version */
- if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
+ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
|| hdr->e_type != ET_REL
|| !elf_check_arch(hdr)
|| hdr->e_shentsize != sizeof(*sechdrs)) {
diff --git a/kernel/relay.c b/kernel/relay.c
index bc24dcdc570f..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
ret = 0;
spliced = 0;
- while (len) {
+ while (len && !spliced) {
ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
if (ret < 0)
break;
diff --git a/kernel/sched.c b/kernel/sched.c
index e2e985eeee78..801abd319355 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -137,7 +137,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
static inline int rt_policy(int policy)
{
- if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+ if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
return 1;
return 0;
}
@@ -399,43 +399,6 @@ struct cfs_rq {
*/
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
- unsigned long task_weight;
- unsigned long shares;
- /*
- * We need space to build a sched_domain wide view of the full task
- * group tree, in order to avoid depending on dynamic memory allocation
- * during the load balancing we place this in the per cpu task group
- * hierarchy. This limits the load balancing to one instance per cpu,
- * but more should not be needed anyway.
- */
- struct aggregate_struct {
- /*
- * load = weight(cpus) * f(tg)
- *
- * Where f(tg) is the recursive weight fraction assigned to
- * this group.
- */
- unsigned long load;
-
- /*
- * part of the group weight distributed to this span.
- */
- unsigned long shares;
-
- /*
- * The sum of all runqueue weights within this span.
- */
- unsigned long rq_weight;
-
- /*
- * Weight contributed by tasks; this is the part we can
- * influence by moving tasks around.
- */
- unsigned long task_weight;
- } aggregate;
-#endif
#endif
};
@@ -633,6 +596,8 @@ static inline void update_rq_clock(struct rq *rq)
rq->clock = sched_clock_cpu(cpu_of(rq));
}
+#include "sched_trace.h"
+
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -884,7 +849,7 @@ static unsigned long long __cpu_clock(int cpu)
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
* clock constructed from sched_clock():
*/
-unsigned long long cpu_clock(int cpu)
+unsigned long long notrace cpu_clock(int cpu)
{
unsigned long long prev_cpu_time, time, delta_time;
unsigned long flags;
@@ -1387,9 +1352,6 @@ static void __resched_task(struct task_struct *p, int tif_bit)
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-/*
- * delta *= weight / lw
- */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
@@ -1412,6 +1374,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+ return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -1524,326 +1492,6 @@ static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- * root 1 - thread
- * / | \ A - group
- * A 1 B
- * /|\ / \
- * C 2 D 3 4
- * | |
- * 5 6
- *
- * load:
- * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- * which equals 1/9-th of the total load.
- *
- * shares:
- * The weight of this group on the selected cpus.
- *
- * rq_weight:
- * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- * B would get 2.
- *
- * task_weight:
- * Part of the rq_weight contributed by tasks; all groups except B would
- * get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
- return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
- struct sched_domain *sd)
-{
- struct task_group *parent, *child;
-
- rcu_read_lock();
- parent = &root_task_group;
-down:
- (*down)(parent, sd);
- list_for_each_entry_rcu(child, &parent->children, siblings) {
- parent = child;
- goto down;
-
-up:
- continue;
- }
- (*up)(parent, sd);
-
- child = parent;
- parent = parent->parent;
- if (parent)
- goto up;
- rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long rq_weight = 0;
- unsigned long task_weight = 0;
- int i;
-
- for_each_cpu_mask(i, sd->span) {
- rq_weight += tg->cfs_rq[i]->load.weight;
- task_weight += tg->cfs_rq[i]->task_weight;
- }
-
- aggregate(tg, sd)->rq_weight = rq_weight;
- aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long shares = 0;
- int i;
-
- for_each_cpu_mask(i, sd->span)
- shares += tg->cfs_rq[i]->shares;
-
- if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long load;
-
- if (!tg->parent) {
- int i;
-
- load = 0;
- for_each_cpu_mask(i, sd->span)
- load += cpu_rq(i)->load.weight;
-
- } else {
- load = aggregate(tg->parent, sd)->load;
-
- /*
- * shares is our weight in the parent's rq so
- * shares/parent->rq_weight gives our fraction of the load
- */
- load *= aggregate(tg, sd)->shares;
- load /= aggregate(tg->parent, sd)->rq_weight + 1;
- }
-
- aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
- int tcpu)
-{
- int boost = 0;
- unsigned long shares;
- unsigned long rq_weight;
-
- if (!tg->se[tcpu])
- return;
-
- rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
- /*
- * If there are currently no tasks on the cpu pretend there is one of
- * average load so that when a new task gets to run here it will not
- * get delayed by group starvation.
- */
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum shares * rq_weight
- * shares = -----------------------
- * \Sum rq_weight
- *
- */
- shares = aggregate(tg, sd)->shares * rq_weight;
- shares /= aggregate(tg, sd)->rq_weight + 1;
-
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
-
- __set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
- int scpu, int dcpu)
-{
- unsigned long shares;
-
- shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
- __update_group_shares_cpu(tg, sd, scpu);
- __update_group_shares_cpu(tg, sd, dcpu);
-
- /*
- * ensure we never loose shares due to rounding errors in the
- * above redistribution.
- */
- shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
- if (shares)
- tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
- int scpu, int dcpu)
-{
- while (tg) {
- __move_group_shares(tg, sd, scpu, dcpu);
- tg = tg->parent;
- }
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long shares = aggregate(tg, sd)->shares;
- int i;
-
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, sd, i);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
-
- aggregate_group_shares(tg, sd);
-
- /*
- * ensure we never loose shares due to rounding errors in the
- * above redistribution.
- */
- shares -= aggregate(tg, sd)->shares;
- if (shares) {
- tg->cfs_rq[sd->first_cpu]->shares += shares;
- aggregate(tg, sd)->shares += shares;
- }
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
- aggregate_group_weight(tg, sd);
- aggregate_group_shares(tg, sd);
- aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
- aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
- if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
- return 0;
-
- aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
- return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
- spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
- cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
- return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
#else /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1864,14 +1512,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#define sched_class_highest (&rt_sched_class)
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
+ inc_load(rq, p);
}
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
+ dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
@@ -1963,7 +1623,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
/*
@@ -1975,7 +1635,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(rq);
+ dec_nr_running(p, rq);
}
/**
@@ -2157,6 +1817,7 @@ void wait_task_inactive(struct task_struct *p)
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
+ trace_kernel_sched_wait(p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
task_rq_unlock(rq, &flags);
@@ -2500,9 +2161,7 @@ out_activate:
success = 1;
out_running:
- trace_mark(kernel_sched_wakeup,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ trace_kernel_sched_wakeup(rq, p);
check_preempt_curr(rq, p);
p->state = TASK_RUNNING;
@@ -2631,11 +2290,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
- trace_mark(kernel_sched_wakeup_new,
- "pid %d state %ld ## rq %p task %p rq->curr %p",
- p->pid, p->state, rq, p, rq->curr);
+ trace_kernel_sched_wakeup_new(rq, p);
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -2808,11 +2465,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_mark(kernel_sched_schedule,
- "prev_pid %d next_pid %d prev_state %ld "
- "## rq %p prev %p next %p",
- prev->pid, next->pid, prev->state,
- rq, prev, next);
+
+ trace_kernel_sched_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -3045,6 +2699,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
|| unlikely(cpu_is_offline(dest_cpu)))
goto out;
+ trace_kernel_sched_migrate_task(p, cpu_of(rq), dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
@@ -3630,12 +3285,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
- int unlock_aggregate;
cpus_setall(*cpus);
- unlock_aggregate = get_aggregate(sd);
-
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
@@ -3751,9 +3403,8 @@ redo:
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
-
- goto out;
+ return -1;
+ return ld_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
@@ -3768,13 +3419,8 @@ out_one_pinned:
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
- else
- ld_moved = 0;
-out:
- if (unlock_aggregate)
- put_aggregate(sd);
- return ld_moved;
+ return -1;
+ return 0;
}
/*
@@ -4481,7 +4127,7 @@ static inline void schedule_debug(struct task_struct *prev)
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+ if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
__schedule_bug(prev);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4982,8 +4628,10 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq)
+ if (on_rq) {
dequeue_task(rq, p, 0);
+ dec_load(rq, p);
+ }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -4993,6 +4641,7 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) {
enqueue_task(rq, p, 0);
+ inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -7367,7 +7016,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
sd->span = *cpu_map;
- sd->first_cpu = first_cpu(sd->span);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
@@ -7378,7 +7026,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), &sd->span);
- sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -7390,7 +7037,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
sd->span = *nodemask;
- sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -7402,7 +7048,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, MC);
set_domain_attribute(sd, attr);
sd->span = cpu_coregroup_map(i);
- sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -7415,7 +7060,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
sd->span = per_cpu(cpu_sibling_map, i);
- sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -7619,8 +7263,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
static cpumask_t *doms_cur; /* current sched domains */
static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur; /* attribues of custom domains
- in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attribues of custom domains in 'doms_cur' */
/*
* Special case: If a kmalloc of a doms_cur partition (array of
@@ -8085,7 +7729,6 @@ void __init sched_init(void)
}
#ifdef CONFIG_SMP
- init_aggregate();
init_defrootdomain();
#endif
@@ -8650,11 +8293,14 @@ void sched_move_task(struct task_struct *tsk)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
int on_rq;
+ spin_lock_irq(&rq->lock);
+
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
@@ -8664,17 +8310,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
-}
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- spin_unlock_irqrestore(&rq->lock, flags);
+ spin_unlock_irq(&rq->lock);
}
static DEFINE_MUTEX(shares_mutex);
@@ -8713,13 +8350,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
+ for_each_possible_cpu(i)
set_se_shares(tg->se[i], shares);
- }
/*
* Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9c597e37f7de..ce05271219ab 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -59,22 +59,26 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
return &per_cpu(sched_clock_data, cpu);
}
+static __read_mostly int sched_clock_running;
+
void sched_clock_init(void)
{
u64 ktime_now = ktime_to_ns(ktime_get());
- u64 now = 0;
+ unsigned long now_jiffies = jiffies;
int cpu;
for_each_possible_cpu(cpu) {
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- scd->prev_jiffies = jiffies;
- scd->prev_raw = now;
- scd->tick_raw = now;
+ scd->prev_jiffies = now_jiffies;
+ scd->prev_raw = 0;
+ scd->tick_raw = 0;
scd->tick_gtod = ktime_now;
scd->clock = ktime_now;
}
+
+ sched_clock_running = 1;
}
/*
@@ -136,6 +140,9 @@ u64 sched_clock_cpu(int cpu)
struct sched_clock_data *scd = cpu_sdc(cpu);
u64 now, clock;
+ if (unlikely(!sched_clock_running))
+ return 0ull;
+
WARN_ON_ONCE(!irqs_disabled());
now = sched_clock();
@@ -174,6 +181,9 @@ void sched_clock_tick(void)
struct sched_clock_data *scd = this_scd();
u64 now, now_gtod;
+ if (unlikely(!sched_clock_running))
+ return;
+
WARN_ON_ONCE(!irqs_disabled());
now = sched_clock();
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5f06118fbc31..8bb713040ac9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
cfs_rq->nr_spread_over);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
-#endif
}
static void print_cpu(struct seq_file *m, int cpu)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..08ae848b71d4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
#endif
/*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- se->load.weight, &cfs_rq_of(se)->load);
- }
-
- return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, &se->load);
- }
-
- return delta;
-}
-
-/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+ u64 slice = __sched_period(cfs_rq->nr_running);
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+ }
+
+
+ return slice;
}
/*
* We calculate the vruntime slice of a to be inserted task
*
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
*/
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;
+ unsigned long weight;
+ u64 vslice;
if (!se->on_rq)
nr_running++;
- return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- * -20 |
- * |
- * 0 --------+-------
- * .'
- * 19 .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
- struct load_weight lw = {
- .weight = NICE_0_LOAD,
- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
- };
+ vslice = __sched_period(nr_running);
for_each_sched_entity(se) {
- struct load_weight *se_lw = &se->load;
+ cfs_rq = cfs_rq_of(se);
- if (se->load.weight < NICE_0_LOAD)
- se_lw = &lw;
+ weight = cfs_rq->load.weight;
+ if (!se->on_rq)
+ weight += se->load.weight;
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, se_lw);
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, weight);
}
- return delta;
+ return vslice;
}
/*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+ delta_exec_weighted = delta_exec;
+ if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+ delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+ &curr->load);
+ }
curr->vruntime += delta_exec_weighted;
}
@@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
- cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- inc_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
- add_cfs_task_weight(cfs_rq, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +523,6 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- dec_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
- add_cfs_task_weight(cfs_rq, -se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
list_del_init(&se->group_node);
@@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS)) {
- unsigned long thresh = sysctl_sched_latency;
-
- /*
- * convert the sleeper threshold into virtual time
- */
- if (sched_feat(NORMALIZED_SLEEPER))
- thresh = calc_delta_fair(thresh, se);
-
- vruntime -= thresh;
- }
+ if (sched_feat(NEW_FAIR_SLEEPERS))
+ vruntime -= sysctl_sched_latency;
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
struct task_struct *curr = this_rq->curr;
unsigned long tl = this_load;
unsigned long tl_per_task;
+ int balanced;
- if (!(this_sd->flags & SD_WAKE_AFFINE))
+ if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
/*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync)
+ tl -= current->se.load.weight;
+
+ balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+
+ /*
* If the currently running task will sleep within
* a reasonable amount of time then attract this newly
* woken task:
*/
- if (sync && curr->sched_class == &fair_sched_class) {
+ if (sync && balanced && curr->sched_class == &fair_sched_class) {
if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
p->se.avg_overlap < sysctl_sched_migration_cost)
return 1;
@@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync)
- tl -= current->se.load.weight;
-
if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
- 100*(tl + p->se.load.weight) <= imbalance*load) {
+ balanced) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
unsigned long gran = sysctl_sched_wakeup_granularity;
/*
- * More easily preempt - nice tasks, while not making it harder for
- * + nice tasks.
+ * More easily preempt - nice tasks, while not making
+ * it harder for + nice tasks.
*/
- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+ if (unlikely(se->load.weight > NICE_0_LOAD))
+ gran = calc_delta_fair(gran, &se->load);
return gran;
}
@@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
- struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
- struct rq_iterator cfs_rq_iterator;
+ struct sched_entity *curr;
+ struct task_struct *p;
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
- cfs_rq_iterator.arg = cfs_rq;
+ if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+ return MAX_PRIO;
+
+ curr = cfs_rq->curr;
+ if (!curr)
+ curr = __pick_next_entity(cfs_rq);
+
+ p = task_of(curr);
- return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &cfs_rq_iterator);
+ return p->prio;
}
+#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
+ struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
- int busiest_cpu = cpu_of(busiest);
- struct task_group *tg;
-
- rcu_read_lock();
- list_for_each_entry(tg, &task_groups, list) {
- long imbalance;
- unsigned long this_weight, busiest_weight;
- long rem_load, max_load, moved_load;
-
- /*
- * empty group
- */
- if (!aggregate(tg, sd)->task_weight)
- continue;
-
- rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
- rem_load /= aggregate(tg, sd)->load + 1;
-
- this_weight = tg->cfs_rq[this_cpu]->task_weight;
- busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+ struct rq_iterator cfs_rq_iterator;
- imbalance = (busiest_weight - this_weight) / 2;
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
- if (imbalance < 0)
- imbalance = busiest_weight;
+ for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct cfs_rq *this_cfs_rq;
+ long imbalance;
+ unsigned long maxload;
- max_load = max(rem_load, imbalance);
- moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
- max_load, sd, idle, all_pinned, this_best_prio,
- tg->cfs_rq[busiest_cpu]);
+ this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
- if (!moved_load)
+ imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+ /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+ if (imbalance <= 0)
continue;
- move_group_shares(tg, sd, busiest_cpu, this_cpu);
+ /* Don't pull more than imbalance/2 */
+ imbalance /= 2;
+ maxload = min(rem_load_move, imbalance);
- moved_load *= aggregate(tg, sd)->load;
- moved_load /= aggregate(tg, sd)->rq_weight + 1;
+ *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+ /*
+ * pass busy_cfs_rq argument into
+ * load_balance_[start|next]_fair iterators
+ */
+ cfs_rq_iterator.arg = busy_cfs_rq;
+ rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+ maxload, sd, idle, all_pinned,
+ this_best_prio,
+ &cfs_rq_iterator);
- rem_load_move -= moved_load;
- if (rem_load_move < 0)
+ if (rem_load_move <= 0)
break;
}
- rcu_read_unlock();
return max_load_move - rem_load_move;
}
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
-{
- return __load_balance_fair(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &busiest->cfs);
-}
-#endif
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..3432d573205d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/
for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se);
-
- inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se);
}
-
- dec_cpu_load(rq, p->se.load.weight);
}
/*
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5bae2e0c3ff2..a38878e0e49d 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -67,6 +67,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
preempt_enable();
#endif
}
+ kfree(mask_str);
return 0;
}
diff --git a/kernel/sched_trace.h b/kernel/sched_trace.h
new file mode 100644
index 000000000000..29b48f34fd02
--- /dev/null
+++ b/kernel/sched_trace.h
@@ -0,0 +1,41 @@
+#include <linux/marker.h>
+
+static inline void trace_kernel_sched_wait(struct task_struct *p)
+{
+ trace_mark(kernel_sched_wait_task, "pid %d state %ld",
+ p->pid, p->state);
+}
+
+static inline
+void trace_kernel_sched_wakeup(struct rq *rq, struct task_struct *p)
+{
+ trace_mark(kernel_sched_wakeup,
+ "pid %d state %ld ## rq %p task %p rq->curr %p",
+ p->pid, p->state, rq, p, rq->curr);
+}
+
+static inline
+void trace_kernel_sched_wakeup_new(struct rq *rq, struct task_struct *p)
+{
+ trace_mark(kernel_sched_wakeup_new,
+ "pid %d state %ld ## rq %p task %p rq->curr %p",
+ p->pid, p->state, rq, p, rq->curr);
+}
+
+static inline void trace_kernel_sched_switch(struct rq *rq,
+ struct task_struct *prev, struct task_struct *next)
+{
+ trace_mark(kernel_sched_schedule,
+ "prev_pid %d next_pid %d prev_state %ld "
+ "## rq %p prev %p next %p",
+ prev->pid, next->pid, prev->state,
+ rq, prev, next);
+}
+
+static inline void
+trace_kernel_sched_migrate_task(struct task_struct *p, int src, int dst)
+{
+ trace_mark(kernel_sched_migrate_task,
+ "pid %d state %ld dest_cpu %d",
+ p->pid, p->state, dst);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index 72bb4f51f963..6c0958e52ea7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t)
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
+static void __flush_itimer_signals(struct sigpending *pending)
+{
+ sigset_t signal, retain;
+ struct sigqueue *q, *n;
+
+ signal = pending->signal;
+ sigemptyset(&retain);
+
+ list_for_each_entry_safe(q, n, &pending->list, list) {
+ int sig = q->info.si_signo;
+
+ if (likely(q->info.si_code != SI_TIMER)) {
+ sigaddset(&retain, sig);
+ } else {
+ sigdelset(&signal, sig);
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+
+ sigorsets(&pending->signal, &signal, &retain);
+}
+
+void flush_itimer_signals(void)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ __flush_itimer_signals(&tsk->pending);
+ __flush_itimer_signals(&tsk->signal->shared_pending);
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+}
+
void ignore_signals(struct task_struct *t)
{
int i;
@@ -1240,17 +1274,22 @@ void sigqueue_free(struct sigqueue *q)
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
/*
- * If the signal is still pending remove it from the
- * pending queue. We must hold ->siglock while testing
- * q->list to serialize with collect_signal().
+ * We must hold ->siglock while testing q->list
+ * to serialize with collect_signal() or with
+ * __exit_signal()->flush_sigqueue().
*/
spin_lock_irqsave(lock, flags);
+ q->flags &= ~SIGQUEUE_PREALLOC;
+ /*
+ * If it is queued it will be freed when dequeued,
+ * like the "regular" sigqueue.
+ */
if (!list_empty(&q->list))
- list_del_init(&q->list);
+ q = NULL;
spin_unlock_irqrestore(lock, flags);
- q->flags &= ~SIGQUEUE_PREALLOC;
- __sigqueue_free(q);
+ if (q)
+ __sigqueue_free(q);
}
int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0101aeef7ed7..b7350bbfb076 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -62,8 +62,7 @@ static int stopmachine(void *cpu)
* help our sisters onto their CPUs. */
if (!prepared && !irqs_disabled)
yield();
- else
- cpu_relax();
+ cpu_relax();
}
/* Ack: we are exiting. */
@@ -106,8 +105,10 @@ static int stop_machine(void)
}
/* Wait for them all to come to life. */
- while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
+ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
yield();
+ cpu_relax();
+ }
/* If some failed, kill them all. */
if (ret < 0) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 895d2d4c9493..14e97282eb6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1652,7 +1652,7 @@ asmlinkage long sys_umask(int mask)
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
- long uninitialized_var(error);
+ long error = 0;
if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
return error;
@@ -1701,9 +1701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
error = PR_TIMING_STATISTICAL;
break;
case PR_SET_TIMING:
- if (arg2 == PR_TIMING_STATISTICAL)
- error = 0;
- else
+ if (arg2 != PR_TIMING_STATISTICAL)
error = -EINVAL;
break;
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 7aec123ec1d8..71d17de17288 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,5 +19,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 89bd9a6f52ec..0118979e211f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -151,8 +151,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
static struct task_struct *ftraced_task;
-static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters);
-static unsigned long ftraced_iteration_counter;
enum {
FTRACE_ENABLE_CALLS = (1 << 0),
@@ -170,7 +168,7 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
static DEFINE_SPINLOCK(ftrace_shutdown_lock);
static DEFINE_MUTEX(ftraced_lock);
-static DEFINE_MUTEX(ftrace_filter_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
struct ftrace_page {
struct ftrace_page *next;
@@ -189,6 +187,7 @@ static struct ftrace_page *ftrace_pages;
static int ftraced_trigger;
static int ftraced_suspend;
+static int ftraced_stop;
static int ftrace_record_suspend;
@@ -201,7 +200,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key)
struct hlist_node *t;
int found = 0;
- hlist_for_each_entry(p, t, &ftrace_hash[key], node) {
+ hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
if (p->ip == ip) {
found = 1;
break;
@@ -214,7 +213,13 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key)
static inline void
ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
{
- hlist_add_head(&node->node, &ftrace_hash[key]);
+ hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
+}
+
+/* called from kstop_machine */
+static inline void ftrace_del_hash(struct dyn_ftrace *node)
+{
+ hlist_del(&node->node);
}
static void ftrace_free_rec(struct dyn_ftrace *rec)
@@ -301,13 +306,6 @@ ftrace_record_ip(unsigned long ip)
if (ftrace_ip_in_hash(ip, key))
goto out_unlock;
- /*
- * There's a slight race that the ftraced will update the
- * hash and reset here. If it is already converted, skip it.
- */
- if (ftrace_ip_converted(ip))
- goto out_unlock;
-
node = ftrace_alloc_dyn_node(ip);
if (!node)
goto out_unlock;
@@ -333,17 +331,15 @@ ftrace_record_ip(unsigned long ip)
#define FTRACE_ADDR ((long)(ftrace_caller))
#define MCOUNT_ADDR ((long)(mcount))
-static void
+static int
__ftrace_replace_code(struct dyn_ftrace *rec,
unsigned char *old, unsigned char *new, int enable)
{
- unsigned long ip;
- int failed;
+ unsigned long ip, fl;
ip = rec->ip;
if (ftrace_filtered && enable) {
- unsigned long fl;
/*
* If filtering is on:
*
@@ -356,14 +352,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
* If this record is not set to be filtered
* and it is not enabled do nothing.
*
+ * If this record is set not to trace then
+ * do nothing.
+ *
* If this record is not set to be filtered and
* it is enabled, disable it.
*/
fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
- (fl == 0))
- return;
+ (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE))
+ return 0;
/*
* If it is enabled disable it,
@@ -380,41 +379,39 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
}
} else {
- if (enable)
+ if (enable) {
+ /*
+ * If this record is set not to trace and is
+ * not enabled, do nothing.
+ */
+ fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+ if (fl == FTRACE_FL_NOTRACE)
+ return 0;
+
new = ftrace_call_replace(ip, FTRACE_ADDR);
- else
+ } else
old = ftrace_call_replace(ip, FTRACE_ADDR);
if (enable) {
if (rec->flags & FTRACE_FL_ENABLED)
- return;
+ return 0;
rec->flags |= FTRACE_FL_ENABLED;
} else {
if (!(rec->flags & FTRACE_FL_ENABLED))
- return;
+ return 0;
rec->flags &= ~FTRACE_FL_ENABLED;
}
}
- failed = ftrace_modify_code(ip, old, new);
- if (failed) {
- unsigned long key;
- /* It is possible that the function hasn't been converted yet */
- key = hash_long(ip, FTRACE_HASHBITS);
- if (!ftrace_ip_in_hash(ip, key)) {
- rec->flags |= FTRACE_FL_FAILED;
- ftrace_free_rec(rec);
- }
-
- }
+ return ftrace_modify_code(ip, old, new);
}
static void ftrace_replace_code(int enable)
{
+ int i, failed;
unsigned char *new = NULL, *old = NULL;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- int i;
if (enable)
old = ftrace_nop_replace();
@@ -429,7 +426,15 @@ static void ftrace_replace_code(int enable)
if (rec->flags & FTRACE_FL_FAILED)
continue;
- __ftrace_replace_code(rec, old, new, enable);
+ failed = __ftrace_replace_code(rec, old, new, enable);
+ if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+ rec->flags |= FTRACE_FL_FAILED;
+ if ((system_state == SYSTEM_BOOTING) ||
+ !core_kernel_text(rec->ip)) {
+ ftrace_del_hash(rec);
+ ftrace_free_rec(rec);
+ }
+ }
}
}
}
@@ -443,7 +448,7 @@ static void ftrace_shutdown_replenish(void)
ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
}
-static void
+static int
ftrace_code_disable(struct dyn_ftrace *rec)
{
unsigned long ip;
@@ -458,18 +463,26 @@ ftrace_code_disable(struct dyn_ftrace *rec)
failed = ftrace_modify_code(ip, call, nop);
if (failed) {
rec->flags |= FTRACE_FL_FAILED;
- ftrace_free_rec(rec);
+ return 0;
}
+ return 1;
}
+static int __ftrace_update_code(void *ignore);
+
static int __ftrace_modify_code(void *data)
{
unsigned long addr;
int *command = data;
- if (*command & FTRACE_ENABLE_CALLS)
+ if (*command & FTRACE_ENABLE_CALLS) {
+ /*
+ * Update any recorded ips now that we have the
+ * machine stopped
+ */
+ __ftrace_update_code(NULL);
ftrace_replace_code(1);
- else if (*command & FTRACE_DISABLE_CALLS)
+ } else if (*command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
if (*command & FTRACE_UPDATE_TRACE_FUNC)
@@ -491,6 +504,25 @@ static void ftrace_run_update_code(int command)
stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
}
+void ftrace_disable_daemon(void)
+{
+ /* Stop the daemon from calling kstop_machine */
+ mutex_lock(&ftraced_lock);
+ ftraced_stop = 1;
+ mutex_unlock(&ftraced_lock);
+
+ ftrace_force_update();
+}
+
+void ftrace_enable_daemon(void)
+{
+ mutex_lock(&ftraced_lock);
+ ftraced_stop = 0;
+ mutex_unlock(&ftraced_lock);
+
+ ftrace_force_update();
+}
+
static ftrace_func_t saved_ftrace_func;
static void ftrace_startup(void)
@@ -584,13 +616,13 @@ unsigned long ftrace_update_tot_cnt;
static int __ftrace_update_code(void *ignore)
{
struct dyn_ftrace *p;
- struct hlist_head head;
- struct hlist_node *t;
+ struct hlist_node *t, *n;
int save_ftrace_enabled;
cycle_t start, stop;
int i;
/* Don't be recording funcs now */
+ ftrace_record_suspend++;
save_ftrace_enabled = ftrace_enabled;
ftrace_enabled = 0;
@@ -599,35 +631,54 @@ static int __ftrace_update_code(void *ignore)
/* No locks needed, the machine is stopped! */
for (i = 0; i < FTRACE_HASHSIZE; i++) {
- if (hlist_empty(&ftrace_hash[i]))
- continue;
+ /* all CPUS are stopped, we are safe to modify code */
+ hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) {
+ /* Skip over failed records which have not been
+ * freed. */
+ if (p->flags & FTRACE_FL_FAILED)
+ continue;
- head = ftrace_hash[i];
- INIT_HLIST_HEAD(&ftrace_hash[i]);
+ /* Unconverted records are always at the head of the
+ * hash bucket. Once we encounter a converted record,
+ * simply skip over to the next bucket. Saves ftraced
+ * some processor cycles (ftrace does its bid for
+ * global warming :-p ). */
+ if (p->flags & (FTRACE_FL_CONVERTED))
+ break;
- /* all CPUS are stopped, we are safe to modify code */
- hlist_for_each_entry(p, t, &head, node) {
- ftrace_code_disable(p);
- ftrace_update_cnt++;
+ if (ftrace_code_disable(p)) {
+ p->flags |= FTRACE_FL_CONVERTED;
+ ftrace_update_cnt++;
+ } else {
+ if ((system_state == SYSTEM_BOOTING) ||
+ !core_kernel_text(p->ip)) {
+ ftrace_del_hash(p);
+ ftrace_free_rec(p);
+ }
+ }
}
-
}
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
ftrace_update_tot_cnt += ftrace_update_cnt;
+ ftraced_trigger = 0;
ftrace_enabled = save_ftrace_enabled;
+ ftrace_record_suspend--;
return 0;
}
-static void ftrace_update_code(void)
+static int ftrace_update_code(void)
{
- if (unlikely(ftrace_disabled))
- return;
+ if (unlikely(ftrace_disabled) ||
+ !ftrace_enabled || !ftraced_trigger)
+ return 0;
stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+
+ return 1;
}
static int ftraced(void *ignore)
@@ -646,14 +697,13 @@ static int ftraced(void *ignore)
mutex_lock(&ftrace_sysctl_lock);
mutex_lock(&ftraced_lock);
- if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) {
- ftrace_record_suspend++;
- ftrace_update_code();
+ if (!ftraced_suspend && !ftraced_stop &&
+ ftrace_update_code()) {
usecs = nsecs_to_usecs(ftrace_update_time);
if (ftrace_update_tot_cnt > 100000) {
ftrace_update_tot_cnt = 0;
pr_info("hm, dftrace overflow: %lu change%s"
- " (%lu total) in %lu usec%s\n",
+ " (%lu total) in %lu usec%s\n",
ftrace_update_cnt,
ftrace_update_cnt != 1 ? "s" : "",
ftrace_update_tot_cnt,
@@ -661,15 +711,10 @@ static int ftraced(void *ignore)
ftrace_disabled = 1;
WARN_ON_ONCE(1);
}
- ftraced_trigger = 0;
- ftrace_record_suspend--;
}
- ftraced_iteration_counter++;
mutex_unlock(&ftraced_lock);
mutex_unlock(&ftrace_sysctl_lock);
- wake_up_interruptible(&ftraced_waiters);
-
ftrace_shutdown_replenish();
}
__set_current_state(TASK_RUNNING);
@@ -721,6 +766,8 @@ static int __init ftrace_dyn_table_alloc(void)
enum {
FTRACE_ITER_FILTER = (1 << 0),
FTRACE_ITER_CONT = (1 << 1),
+ FTRACE_ITER_NOTRACE = (1 << 2),
+ FTRACE_ITER_FAILURES = (1 << 3),
};
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -752,9 +799,18 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
} else {
rec = &iter->pg->records[iter->idx++];
- if ((rec->flags & FTRACE_FL_FAILED) ||
+ if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+ (rec->flags & FTRACE_FL_FAILED)) ||
+
+ ((iter->flags & FTRACE_ITER_FAILURES) &&
+ (!(rec->flags & FTRACE_FL_FAILED) ||
+ (rec->flags & FTRACE_FL_FREE))) ||
+
((iter->flags & FTRACE_ITER_FILTER) &&
- !(rec->flags & FTRACE_FL_FILTER))) {
+ !(rec->flags & FTRACE_FL_FILTER)) ||
+
+ ((iter->flags & FTRACE_ITER_NOTRACE) &&
+ !(rec->flags & FTRACE_FL_NOTRACE))) {
rec = NULL;
goto retry;
}
@@ -847,22 +903,42 @@ int ftrace_avail_release(struct inode *inode, struct file *file)
return 0;
}
-static void ftrace_filter_reset(void)
+static int
+ftrace_failures_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ struct ftrace_iterator *iter;
+
+ ret = ftrace_avail_open(inode, file);
+ if (!ret) {
+ m = (struct seq_file *)file->private_data;
+ iter = (struct ftrace_iterator *)m->private;
+ iter->flags = FTRACE_ITER_FAILURES;
+ }
+
+ return ret;
+}
+
+
+static void ftrace_filter_reset(int enable)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec;
+ unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
unsigned i;
/* keep kstop machine from running */
preempt_disable();
- ftrace_filtered = 0;
+ if (enable)
+ ftrace_filtered = 0;
pg = ftrace_pages_start;
while (pg) {
for (i = 0; i < pg->index; i++) {
rec = &pg->records[i];
if (rec->flags & FTRACE_FL_FAILED)
continue;
- rec->flags &= ~FTRACE_FL_FILTER;
+ rec->flags &= ~type;
}
pg = pg->next;
}
@@ -870,7 +946,7 @@ static void ftrace_filter_reset(void)
}
static int
-ftrace_filter_open(struct inode *inode, struct file *file)
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
{
struct ftrace_iterator *iter;
int ret = 0;
@@ -882,15 +958,16 @@ ftrace_filter_open(struct inode *inode, struct file *file)
if (!iter)
return -ENOMEM;
- mutex_lock(&ftrace_filter_lock);
+ mutex_lock(&ftrace_regex_lock);
if ((file->f_mode & FMODE_WRITE) &&
!(file->f_flags & O_APPEND))
- ftrace_filter_reset();
+ ftrace_filter_reset(enable);
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
iter->pos = -1;
- iter->flags = FTRACE_ITER_FILTER;
+ iter->flags = enable ? FTRACE_ITER_FILTER :
+ FTRACE_ITER_NOTRACE;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -900,13 +977,25 @@ ftrace_filter_open(struct inode *inode, struct file *file)
kfree(iter);
} else
file->private_data = iter;
- mutex_unlock(&ftrace_filter_lock);
+ mutex_unlock(&ftrace_regex_lock);
return ret;
}
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(inode, file, 1);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(inode, file, 0);
+}
+
static ssize_t
-ftrace_filter_read(struct file *file, char __user *ubuf,
+ftrace_regex_read(struct file *file, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
if (file->f_mode & FMODE_READ)
@@ -916,7 +1005,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf,
}
static loff_t
-ftrace_filter_lseek(struct file *file, loff_t offset, int origin)
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
{
loff_t ret;
@@ -936,13 +1025,14 @@ enum {
};
static void
-ftrace_match(unsigned char *buff, int len)
+ftrace_match(unsigned char *buff, int len, int enable)
{
char str[KSYM_SYMBOL_LEN];
char *search = NULL;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
int type = MATCH_FULL;
+ unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
unsigned i, match = 0, search_len = 0;
for (i = 0; i < len; i++) {
@@ -966,7 +1056,8 @@ ftrace_match(unsigned char *buff, int len)
/* keep kstop machine from running */
preempt_disable();
- ftrace_filtered = 1;
+ if (enable)
+ ftrace_filtered = 1;
pg = ftrace_pages_start;
while (pg) {
for (i = 0; i < pg->index; i++) {
@@ -997,7 +1088,7 @@ ftrace_match(unsigned char *buff, int len)
break;
}
if (matched)
- rec->flags |= FTRACE_FL_FILTER;
+ rec->flags |= flag;
}
pg = pg->next;
}
@@ -1005,8 +1096,8 @@ ftrace_match(unsigned char *buff, int len)
}
static ssize_t
-ftrace_filter_write(struct file *file, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, int enable)
{
struct ftrace_iterator *iter;
char ch;
@@ -1016,7 +1107,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
if (!cnt || cnt < 0)
return 0;
- mutex_lock(&ftrace_filter_lock);
+ mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
@@ -1045,7 +1136,6 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
cnt--;
}
-
if (isspace(ch)) {
file->f_pos += read;
ret = read;
@@ -1072,7 +1162,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
if (isspace(ch)) {
iter->filtered++;
iter->buffer[iter->buffer_idx] = 0;
- ftrace_match(iter->buffer, iter->buffer_idx);
+ ftrace_match(iter->buffer, iter->buffer_idx, enable);
iter->buffer_idx = 0;
} else
iter->flags |= FTRACE_ITER_CONT;
@@ -1082,11 +1172,39 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
ret = read;
out:
- mutex_unlock(&ftrace_filter_lock);
+ mutex_unlock(&ftrace_regex_lock);
return ret;
}
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+ if (unlikely(ftrace_disabled))
+ return;
+
+ mutex_lock(&ftrace_regex_lock);
+ if (reset)
+ ftrace_filter_reset(enable);
+ if (buf)
+ ftrace_match(buf, len, enable);
+ mutex_unlock(&ftrace_regex_lock);
+}
+
/**
* ftrace_set_filter - set a function to filter on in ftrace
* @buf - the string that holds the function filter text.
@@ -1098,24 +1216,31 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
*/
void ftrace_set_filter(unsigned char *buf, int len, int reset)
{
- if (unlikely(ftrace_disabled))
- return;
+ ftrace_set_regex(buf, len, reset, 1);
+}
- mutex_lock(&ftrace_filter_lock);
- if (reset)
- ftrace_filter_reset();
- if (buf)
- ftrace_match(buf, len);
- mutex_unlock(&ftrace_filter_lock);
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(buf, len, reset, 0);
}
static int
-ftrace_filter_release(struct inode *inode, struct file *file)
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
- mutex_lock(&ftrace_filter_lock);
+ mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
iter = m->private;
@@ -1126,7 +1251,7 @@ ftrace_filter_release(struct inode *inode, struct file *file)
if (iter->buffer_idx) {
iter->filtered++;
iter->buffer[iter->buffer_idx] = 0;
- ftrace_match(iter->buffer, iter->buffer_idx);
+ ftrace_match(iter->buffer, iter->buffer_idx, enable);
}
mutex_lock(&ftrace_sysctl_lock);
@@ -1137,10 +1262,71 @@ ftrace_filter_release(struct inode *inode, struct file *file)
mutex_unlock(&ftrace_sysctl_lock);
kfree(iter);
- mutex_unlock(&ftrace_filter_lock);
+ mutex_unlock(&ftrace_regex_lock);
return 0;
}
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_release(inode, file, 1);
+}
+
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_release(inode, file, 0);
+}
+
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ /* don't worry about races */
+ char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
+ int r = strlen(buf);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ if (strncmp(buf, "enable", 6) == 0)
+ val = 1;
+ else if (strncmp(buf, "disable", 7) == 0)
+ val = 0;
+ else {
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ val = !!val;
+ }
+
+ if (val)
+ ftrace_enable_daemon();
+ else
+ ftrace_disable_daemon();
+
+ filp->f_pos += cnt;
+
+ return cnt;
+}
+
static struct file_operations ftrace_avail_fops = {
.open = ftrace_avail_open,
.read = seq_read,
@@ -1148,59 +1334,57 @@ static struct file_operations ftrace_avail_fops = {
.release = ftrace_avail_release,
};
+static struct file_operations ftrace_failures_fops = {
+ .open = ftrace_failures_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = ftrace_avail_release,
+};
+
static struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
- .read = ftrace_filter_read,
+ .read = ftrace_regex_read,
.write = ftrace_filter_write,
- .llseek = ftrace_filter_lseek,
+ .llseek = ftrace_regex_lseek,
.release = ftrace_filter_release,
};
+static struct file_operations ftrace_notrace_fops = {
+ .open = ftrace_notrace_open,
+ .read = ftrace_regex_read,
+ .write = ftrace_notrace_write,
+ .llseek = ftrace_regex_lseek,
+ .release = ftrace_notrace_release,
+};
+
+static struct file_operations ftraced_fops = {
+ .open = tracing_open_generic,
+ .read = ftraced_read,
+ .write = ftraced_write,
+};
+
/**
* ftrace_force_update - force an update to all recording ftrace functions
- *
- * The ftrace dynamic update daemon only wakes up once a second.
- * There may be cases where an update needs to be done immediately
- * for tests or internal kernel tracing to begin. This function
- * wakes the daemon to do an update and will not return until the
- * update is complete.
*/
int ftrace_force_update(void)
{
- unsigned long last_counter;
- DECLARE_WAITQUEUE(wait, current);
int ret = 0;
if (unlikely(ftrace_disabled))
return -ENODEV;
+ mutex_lock(&ftrace_sysctl_lock);
mutex_lock(&ftraced_lock);
- last_counter = ftraced_iteration_counter;
-
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&ftraced_waiters, &wait);
-
- if (unlikely(!ftraced_task)) {
- ret = -ENODEV;
- goto out;
- }
- do {
- mutex_unlock(&ftraced_lock);
- wake_up_process(ftraced_task);
- schedule();
- mutex_lock(&ftraced_lock);
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
- set_current_state(TASK_INTERRUPTIBLE);
- } while (last_counter == ftraced_iteration_counter);
+ /*
+ * If ftraced_trigger is not set, then there is nothing
+ * to update.
+ */
+ if (ftraced_trigger && !ftrace_update_code())
+ ret = -EBUSY;
- out:
mutex_unlock(&ftraced_lock);
- remove_wait_queue(&ftraced_waiters, &wait);
- set_current_state(TASK_RUNNING);
+ mutex_unlock(&ftrace_sysctl_lock);
return ret;
}
@@ -1234,11 +1418,28 @@ static __init int ftrace_init_debugfs(void)
pr_warning("Could not create debugfs "
"'available_filter_functions' entry\n");
+ entry = debugfs_create_file("failures", 0444,
+ d_tracer, NULL, &ftrace_failures_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'failures' entry\n");
+
entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
NULL, &ftrace_filter_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'set_ftrace_filter' entry\n");
+
+ entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+ NULL, &ftrace_notrace_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'set_ftrace_notrace' entry\n");
+
+ entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+ NULL, &ftraced_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs "
+ "'ftraced_enabled' entry\n");
return 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 95b7c48a9a1d..2c6ffd6aad47 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -27,6 +27,7 @@
#include <linux/poll.h>
#include <linux/gfp.h>
#include <linux/fs.h>
+#include <linux/kprobes.h>
#include <linux/writeback.h>
#include <linux/stacktrace.h>
@@ -42,11 +43,6 @@ static cpumask_t __read_mostly tracing_buffer_mask;
#define for_each_tracing_cpu(cpu) \
for_each_cpu_mask(cpu, tracing_buffer_mask)
-/* dummy trace to disable tracing */
-static struct tracer no_tracer __read_mostly = {
- .name = "none",
-};
-
static int trace_alloc_page(void);
static int trace_free_page(void);
@@ -134,6 +130,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds iter_ctrl options */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+static notrace void no_trace_init(struct trace_array *tr)
+{
+ int cpu;
+
+ if(tr->ctrl)
+ for_each_online_cpu(cpu)
+ tracing_reset(tr->data[cpu]);
+ tracer_enabled = 0;
+}
+
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly = {
+ .name = "none",
+ .init = no_trace_init
+};
+
+
/**
* trace_wake_up - wake up tasks waiting for trace input
*
@@ -249,24 +262,32 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
tracing_record_cmdline(current);
}
+#define CHECK_COND(cond) \
+ if (unlikely(cond)) { \
+ tracing_disabled = 1; \
+ WARN_ON(1); \
+ return -1; \
+ }
+
/**
* check_pages - integrity check of trace buffers
*
* As a safty measure we check to make sure the data pages have not
- * been corrupted. TODO: configure to disable this because it adds
- * a bit of overhead.
+ * been corrupted.
*/
-void check_pages(struct trace_array_cpu *data)
+int check_pages(struct trace_array_cpu *data)
{
struct page *page, *tmp;
- BUG_ON(data->trace_pages.next->prev != &data->trace_pages);
- BUG_ON(data->trace_pages.prev->next != &data->trace_pages);
+ CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+ CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
- BUG_ON(page->lru.next->prev != &page->lru);
- BUG_ON(page->lru.prev->next != &page->lru);
+ CHECK_COND(page->lru.next->prev != &page->lru);
+ CHECK_COND(page->lru.prev->next != &page->lru);
}
+
+ return 0;
}
/**
@@ -280,7 +301,6 @@ void *head_page(struct trace_array_cpu *data)
{
struct page *page;
- check_pages(data);
if (list_empty(&data->trace_pages))
return NULL;
@@ -645,9 +665,6 @@ static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
static int cmdline_idx;
static DEFINE_SPINLOCK(trace_cmdline_lock);
-/* trace in all context switches */
-atomic_t trace_record_cmdline_enabled __read_mostly;
-
/* temporary disable recording */
atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -831,6 +848,48 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
trace_function(tr, data, ip, parent_ip, flags);
}
+#ifdef CONFIG_MMIOTRACE
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+ struct mmiotrace_rw *rw)
+{
+ struct trace_entry *entry;
+ unsigned long irq_flags;
+
+ raw_local_irq_save(irq_flags);
+ __raw_spin_lock(&data->lock);
+
+ entry = tracing_get_trace_entry(tr, data);
+ tracing_generic_entry_update(entry, 0);
+ entry->type = TRACE_MMIO_RW;
+ entry->mmiorw = *rw;
+
+ __raw_spin_unlock(&data->lock);
+ raw_local_irq_restore(irq_flags);
+
+ trace_wake_up();
+}
+
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+ struct mmiotrace_map *map)
+{
+ struct trace_entry *entry;
+ unsigned long irq_flags;
+
+ raw_local_irq_save(irq_flags);
+ __raw_spin_lock(&data->lock);
+
+ entry = tracing_get_trace_entry(tr, data);
+ tracing_generic_entry_update(entry, 0);
+ entry->type = TRACE_MMIO_MAP;
+ entry->mmiomap = *map;
+
+ __raw_spin_unlock(&data->lock);
+ raw_local_irq_restore(irq_flags);
+
+ trace_wake_up();
+}
+#endif
+
void __trace_stack(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long flags,
@@ -934,6 +993,30 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
trace_wake_up();
}
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+ struct trace_array *tr = &global_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+
+ if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+ return;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1))
+ __trace_special(tr, data, arg1, arg2, arg3);
+
+ atomic_dec(&data->disabled);
+ local_irq_restore(flags);
+}
+
#ifdef CONFIG_FTRACE
static void
function_trace_call(unsigned long ip, unsigned long parent_ip)
@@ -1171,6 +1254,20 @@ static void s_stop(struct seq_file *m, void *p)
mutex_unlock(&trace_types_lock);
}
+#define KRETPROBE_MSG "[unknown/kretprobe'd]"
+
+#ifdef CONFIG_KRETPROBES
+static inline int kretprobed(unsigned long addr)
+{
+ return addr == (unsigned long)kretprobe_trampoline;
+}
+#else
+static inline int kretprobed(unsigned long addr)
+{
+ return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
static int
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
@@ -1406,7 +1503,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
case TRACE_FN:
seq_print_ip_sym(s, entry->fn.ip, sym_flags);
trace_seq_puts(s, " (");
- seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+ if (kretprobed(entry->fn.parent_ip))
+ trace_seq_puts(s, KRETPROBE_MSG);
+ else
+ seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
trace_seq_puts(s, ")\n");
break;
case TRACE_CTX:
@@ -1486,8 +1586,11 @@ static int print_trace_fmt(struct trace_iterator *iter)
ret = trace_seq_printf(s, " <-");
if (!ret)
return 0;
- ret = seq_print_ip_sym(s, entry->fn.parent_ip,
- sym_flags);
+ if (kretprobed(entry->fn.parent_ip))
+ ret = trace_seq_puts(s, KRETPROBE_MSG);
+ else
+ ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+ sym_flags);
if (!ret)
return 0;
}
@@ -2566,7 +2669,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
{
unsigned long val;
char buf[64];
- int ret;
+ int i, ret;
if (cnt >= sizeof(buf))
return -EINVAL;
@@ -2635,8 +2738,15 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
trace_free_page();
}
+ /* check integrity */
+ for_each_tracing_cpu(i)
+ check_pages(global_trace.data[i]);
+
filp->f_pos += cnt;
+ /* If check pages failed, return ENOMEM */
+ if (tracing_disabled)
+ cnt = -ENOMEM;
out:
max_tr.entries = global_trace.entries;
mutex_unlock(&trace_types_lock);
@@ -2930,8 +3040,6 @@ __init static int tracer_alloc_buffers(void)
int ret = -ENOMEM;
int i;
- global_trace.ctrl = tracer_enabled;
-
/* TODO: make the number of buffers hot pluggable with CPUS */
tracing_nr_buffers = num_possible_cpus();
tracing_buffer_mask = cpu_possible_map;
@@ -3001,6 +3109,7 @@ __init static int tracer_alloc_buffers(void)
current_trace = &no_tracer;
/* All seems OK, enable tracing */
+ global_trace.ctrl = tracer_enabled;
tracing_disabled = 0;
return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b7f85d9c80d7..8cb215b239d5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,6 +5,7 @@
#include <asm/atomic.h>
#include <linux/sched.h>
#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -14,6 +15,8 @@ enum trace_type {
TRACE_WAKE,
TRACE_STACK,
TRACE_SPECIAL,
+ TRACE_MMIO_RW,
+ TRACE_MMIO_MAP,
__TRACE_LAST_TYPE
};
@@ -75,6 +78,8 @@ struct trace_entry {
struct ctx_switch_entry ctx;
struct special_entry special;
struct stack_entry stack;
+ struct mmiotrace_rw mmiorw;
+ struct mmiotrace_map mmiomap;
};
};
@@ -220,6 +225,8 @@ void trace_function(struct trace_array *tr,
void tracing_start_function_trace(void);
void tracing_stop_function_trace(void);
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
int register_tracer(struct tracer *type);
void unregister_tracer(struct tracer *type);
@@ -228,8 +235,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
extern unsigned long tracing_max_latency;
extern unsigned long tracing_thresh;
-extern atomic_t trace_record_cmdline_enabled;
-
void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
@@ -257,6 +262,15 @@ extern unsigned long ftrace_update_tot_cnt;
extern int DYN_FTRACE_TEST_NAME(void);
#endif
+#ifdef CONFIG_MMIOTRACE
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+ struct trace_array_cpu *data,
+ struct mmiotrace_map *map);
+#endif
+
#ifdef CONFIG_FTRACE_STARTUP_TEST
#ifdef CONFIG_FTRACE
extern int trace_selftest_startup_function(struct tracer *trace,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0a084656d7cf..7ee7dcd76b7d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -29,14 +29,14 @@ static void function_reset(struct trace_array *tr)
static void start_function_trace(struct trace_array *tr)
{
function_reset(tr);
- atomic_inc(&trace_record_cmdline_enabled);
+ tracing_start_cmdline_record();
tracing_start_function_trace();
}
static void stop_function_trace(struct trace_array *tr)
{
tracing_stop_function_trace();
- atomic_dec(&trace_record_cmdline_enabled);
+ tracing_stop_cmdline_record();
}
static void function_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 761f3ec66c50..421d6fe3650e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -165,22 +165,6 @@ check_critical_timing(struct trace_array *tr,
update_max_tr_single(tr, current, cpu);
- if (!runqueue_is_locked()) {
- if (tracing_thresh) {
- printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical"
- " section violates %lu us threshold.\n",
- current->comm, current->pid,
- raw_smp_processor_id(),
- latency, nsecs_to_usecs(tracing_thresh));
- } else {
- printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us"
- " maximum-latency critical section.\n",
- current->comm, current->pid,
- raw_smp_processor_id(),
- latency);
- }
- }
-
max_sequence++;
out_unlock:
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..b13dc19dcbb4
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,295 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+
+#include "trace.h"
+
+struct header_iter {
+ struct pci_dev *dev;
+};
+
+static struct trace_array *mmio_trace_array;
+static bool overrun_detected;
+
+static void mmio_reset_data(struct trace_array *tr)
+{
+ int cpu;
+
+ overrun_detected = false;
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr->data[cpu]);
+}
+
+static void mmio_trace_init(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ mmio_trace_array = tr;
+ if (tr->ctrl) {
+ mmio_reset_data(tr);
+ enable_mmiotrace();
+ }
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ if (tr->ctrl)
+ disable_mmiotrace();
+ mmio_reset_data(tr);
+ mmio_trace_array = NULL;
+}
+
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+ pr_debug("in %s\n", __func__);
+ if (tr->ctrl) {
+ mmio_reset_data(tr);
+ enable_mmiotrace();
+ } else {
+ disable_mmiotrace();
+ }
+}
+
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+ int ret = 0;
+ int i;
+ resource_size_t start, end;
+ const struct pci_driver *drv = pci_dev_driver(dev);
+
+ /* XXX: incomplete checks for trace_seq_printf() return value */
+ ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
+ /*
+ * XXX: is pci_resource_to_user() appropriate, since we are
+ * supposed to interpret the __ioremap() phys_addr argument based on
+ * these printed values?
+ */
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ ret += trace_seq_printf(s, " %llx",
+ (unsigned long long)(start |
+ (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+ }
+ for (i = 0; i < 7; i++) {
+ pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+ ret += trace_seq_printf(s, " %llx",
+ dev->resource[i].start < dev->resource[i].end ?
+ (unsigned long long)(end - start) + 1 : 0);
+ }
+ if (drv)
+ ret += trace_seq_printf(s, " %s\n", drv->name);
+ else
+ ret += trace_seq_printf(s, " \n");
+ return ret;
+}
+
+static void destroy_header_iter(struct header_iter *hiter)
+{
+ if (!hiter)
+ return;
+ pci_dev_put(hiter->dev);
+ kfree(hiter);
+}
+
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+ struct header_iter *hiter;
+ struct trace_seq *s = &iter->seq;
+
+ trace_seq_printf(s, "VERSION 20070824\n");
+
+ hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+ if (!hiter)
+ return;
+
+ hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+ iter->private = hiter;
+}
+
+/* XXX: This is not called when the pipe is closed! */
+static void mmio_close(struct trace_iterator *iter)
+{
+ struct header_iter *hiter = iter->private;
+ destroy_header_iter(hiter);
+ iter->private = NULL;
+}
+
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+ int cpu;
+ unsigned long cnt = 0;
+ for_each_online_cpu(cpu) {
+ cnt += iter->overrun[cpu];
+ iter->overrun[cpu] = 0;
+ }
+ return cnt;
+}
+
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+ char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ ssize_t ret;
+ struct header_iter *hiter = iter->private;
+ struct trace_seq *s = &iter->seq;
+ unsigned long n;
+
+ n = count_overruns(iter);
+ if (n) {
+ /* XXX: This is later than where events were lost. */
+ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+ if (!overrun_detected)
+ pr_warning("mmiotrace has lost events.\n");
+ overrun_detected = true;
+ goto print_out;
+ }
+
+ if (!hiter)
+ return 0;
+
+ mmio_print_pcidev(s, hiter->dev);
+ hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+ if (!hiter->dev) {
+ destroy_header_iter(hiter);
+ iter->private = NULL;
+ }
+
+print_out:
+ ret = trace_seq_to_user(s, ubuf, cnt);
+ return (ret == -EBUSY) ? 0 : ret;
+}
+
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct mmiotrace_rw *rw = &entry->mmiorw;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(entry->t);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret = 1;
+
+ switch (entry->mmiorw.opcode) {
+ case MMIO_READ:
+ ret = trace_seq_printf(s,
+ "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ rw->value, rw->pc, 0);
+ break;
+ case MMIO_WRITE:
+ ret = trace_seq_printf(s,
+ "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+ rw->width, secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ rw->value, rw->pc, 0);
+ break;
+ case MMIO_UNKNOWN_OP:
+ ret = trace_seq_printf(s,
+ "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+ secs, usec_rem, rw->map_id,
+ (unsigned long long)rw->phys,
+ (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+ (rw->value >> 0) & 0xff, rw->pc, 0);
+ break;
+ default:
+ ret = trace_seq_printf(s, "rw what?\n");
+ break;
+ }
+ if (ret)
+ return 1;
+ return 0;
+}
+
+static int mmio_print_map(struct trace_iterator *iter)
+{
+ struct trace_entry *entry = iter->ent;
+ struct mmiotrace_map *m = &entry->mmiomap;
+ struct trace_seq *s = &iter->seq;
+ unsigned long long t = ns2usecs(entry->t);
+ unsigned long usec_rem = do_div(t, 1000000ULL);
+ unsigned secs = (unsigned long)t;
+ int ret = 1;
+
+ switch (entry->mmiorw.opcode) {
+ case MMIO_PROBE:
+ ret = trace_seq_printf(s,
+ "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+ secs, usec_rem, m->map_id,
+ (unsigned long long)m->phys, m->virt, m->len,
+ 0UL, 0);
+ break;
+ case MMIO_UNPROBE:
+ ret = trace_seq_printf(s,
+ "UNMAP %lu.%06lu %d 0x%lx %d\n",
+ secs, usec_rem, m->map_id, 0UL, 0);
+ break;
+ default:
+ ret = trace_seq_printf(s, "map what?\n");
+ break;
+ }
+ if (ret)
+ return 1;
+ return 0;
+}
+
+/* return 0 to abort printing without consuming current entry in pipe mode */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+ switch (iter->ent->type) {
+ case TRACE_MMIO_RW:
+ return mmio_print_rw(iter);
+ case TRACE_MMIO_MAP:
+ return mmio_print_map(iter);
+ default:
+ return 1; /* ignore unknown entries */
+ }
+}
+
+static struct tracer mmio_tracer __read_mostly =
+{
+ .name = "mmiotrace",
+ .init = mmio_trace_init,
+ .reset = mmio_trace_reset,
+ .pipe_open = mmio_pipe_open,
+ .close = mmio_close,
+ .read = mmio_read,
+ .ctrl_update = mmio_trace_ctrl_update,
+ .print_line = mmio_print_line,
+};
+
+__init static int init_mmio_trace(void)
+{
+ return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data = tr->data[smp_processor_id()];
+ __trace_mmiotrace_rw(tr, data, rw);
+}
+
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+ struct trace_array *tr = mmio_trace_array;
+ struct trace_array_cpu *data;
+
+ preempt_disable();
+ data = tr->data[smp_processor_id()];
+ __trace_mmiotrace_map(tr, data, map);
+ preempt_enable();
+}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index d25ffa5eaf2b..c16935d3bc5c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,6 +29,9 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
long disabled;
int cpu;
+ tracing_record_cmdline(prev);
+ tracing_record_cmdline(next);
+
if (!tracer_enabled)
return;
@@ -63,8 +66,6 @@ sched_switch_callback(void *probe_data, void *call_data,
prev = va_arg(*args, typeof(prev));
next = va_arg(*args, typeof(next));
- tracing_record_cmdline(prev);
-
/*
* If tracer_switch_func only points to the local
* switch func, it still needs the ptr passed to it.
@@ -125,30 +126,6 @@ wake_up_callback(void *probe_data, void *call_data,
wakeup_func(probe_data, __rq, task, curr);
}
-void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
- struct trace_array *tr = ctx_trace;
- struct trace_array_cpu *data;
- unsigned long flags;
- long disabled;
- int cpu;
-
- if (!tracer_enabled)
- return;
-
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = tr->data[cpu];
- disabled = atomic_inc_return(&data->disabled);
-
- if (likely(disabled == 1))
- __trace_special(tr, data, arg1, arg2, arg3);
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
-}
-
static void sched_switch_reset(struct trace_array *tr)
{
int cpu;
@@ -237,18 +214,26 @@ void tracing_stop_sched_switch(void)
tracing_sched_unregister();
}
+void tracing_start_cmdline_record(void)
+{
+ tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+ tracing_stop_sched_switch();
+}
+
static void start_sched_trace(struct trace_array *tr)
{
sched_switch_reset(tr);
- atomic_inc(&trace_record_cmdline_enabled);
tracer_enabled = 1;
- tracing_start_sched_switch();
+ tracing_start_cmdline_record();
}
static void stop_sched_trace(struct trace_array *tr)
{
- tracing_stop_sched_switch();
- atomic_dec(&trace_record_cmdline_enabled);
+ tracing_stop_cmdline_record();
tracer_enabled = 0;
}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5d2fb48e47f8..bf7e91caef57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -30,6 +30,69 @@ static DEFINE_SPINLOCK(wakeup_lock);
static void __wakeup_reset(struct trace_array *tr);
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+ struct trace_array *tr = wakeup_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int resched;
+ int cpu;
+
+ if (likely(!wakeup_task))
+ return;
+
+ resched = need_resched();
+ preempt_disable_notrace();
+
+ cpu = raw_smp_processor_id();
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+ if (unlikely(disabled != 1))
+ goto out;
+
+ spin_lock_irqsave(&wakeup_lock, flags);
+
+ if (unlikely(!wakeup_task))
+ goto unlock;
+
+ /*
+ * The task can't disappear because it needs to
+ * wake up first, and we have the wakeup_lock.
+ */
+ if (task_cpu(wakeup_task) != cpu)
+ goto unlock;
+
+ trace_function(tr, data, ip, parent_ip, flags);
+
+ unlock:
+ spin_unlock_irqrestore(&wakeup_lock, flags);
+
+ out:
+ atomic_dec(&data->disabled);
+
+ /*
+ * To prevent recursion from the scheduler, if the
+ * resched flag was set before we entered, then
+ * don't reschedule.
+ */
+ if (resched)
+ preempt_enable_no_resched_notrace();
+ else
+ preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+ .func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
/*
* Should this new latency be reported/recorded?
*/
@@ -73,7 +136,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
if (next != wakeup_task)
return;
- /* The task we are waitng for is waking up */
+ /* The task we are waiting for is waking up */
data = tr->data[wakeup_cpu];
/* disable local data, not wakeup_cpu data */
@@ -290,6 +353,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
smp_wmb();
tracer_enabled = 1;
+ register_ftrace_function(&trace_ops);
return;
fail_deprobe_wake_new:
@@ -305,6 +369,7 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
+ unregister_ftrace_function(&trace_ops);
marker_probe_unregister("kernel_sched_schedule",
sched_switch_callback,
&wakeup_trace);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5588ecc40985..0911b7e073bf 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -28,6 +28,7 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
page = list_entry(data->trace_pages.next, struct page, lru);
entries = page_address(page);
+ check_pages(data);
if (head_page(data) != entries)
goto failed;