summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-02-23 11:21:46 +0100
committerIngo Molnar <mingo@elte.hu>2011-02-23 11:21:46 +0100
commit44495b51ad23fa2cdbfaf9d2411a5dc99f9a06e3 (patch)
treead91f5de714f4abd8d7f9e83477d09ddb62f1ee4 /kernel
parent25f0c496aecd25051cdf5bac595729f8406c242c (diff)
parentc97cf42219b7b6037d2f96c27a5f114f2383f828 (diff)
Merge branch 'perf/core' into auto-latest
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c54
-rw-r--r--kernel/perf_event.c716
-rw-r--r--kernel/sched.c6
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/trace/trace_sched_switch.c48
-rw-r--r--kernel/trace/trace_syscalls.c42
7 files changed, 737 insertions, 135 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
- int i;
struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
+ int i;
/*
* Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
+
+ if (run_callbacks && need_forkexit_callback) {
+ /*
+ * modular subsystems can't use callbacks, so no need to lock
+ * the subsys array
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit) {
+ struct cgroup *old_cgrp =
+ rcu_dereference_raw(cg->subsys[i])->cgroup;
+ struct cgroup *cgrp = task_cgroup(tsk, i);
+ ss->exit(ss, cgrp, old_cgrp, tsk);
+ }
+ }
+ }
task_unlock(tsk);
+
if (cg)
put_css_set_taskexit(cg);
}
@@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+ struct cgroup *cgrp;
+ struct inode *inode;
+ struct cgroup_subsys_state *css;
+
+ inode = f->f_dentry->d_inode;
+ /* check in cgroup filesystem dir */
+ if (inode->i_op != &cgroup_dir_inode_operations)
+ return ERR_PTR(-EBADF);
+
+ if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+ return ERR_PTR(-EINVAL);
+
+ /* get cgroup */
+ cgrp = __d_cgrp(f->f_dentry);
+ css = cgrp->subsys[id];
+ return css ? css : ERR_PTR(-ENOENT);
+}
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d3f282fa50e..a0a6987fabc4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
return data.ret;
}
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+ PERF_FLAG_FD_OUTPUT |\
+ PERF_FLAG_PID_CGROUP)
+
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
@@ -140,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
/*
* max perf event sample rate
*/
-int sysctl_perf_event_sample_rate __read_mostly = 100000;
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int max_samples_per_tick __read_mostly =
+ DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+
+ return 0;
+}
static atomic64_t perf_event_id;
@@ -148,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type);
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
@@ -162,6 +193,338 @@ static inline u64 perf_clock(void)
return local_clock();
}
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+ return container_of(task_subsys_state(task, perf_subsys_id),
+ struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ return !event->cgrp || event->cgrp == cpuctx->cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+ css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+ css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+ perf_put_cgroup(event);
+ event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ struct perf_cgroup_info *t;
+
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+ struct perf_cgroup_info *info;
+ u64 now;
+
+ now = perf_clock();
+
+ info = this_cpu_ptr(cgrp->info);
+
+ info->time += now - info->timestamp;
+ info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+ struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+ if (cgrp_out)
+ __update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+ struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
+ /*
+ * do not update time when cgroup is not active
+ */
+ if (!event->cgrp || cgrp != event->cgrp)
+ return;
+
+ __update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+ struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
+
+ if (!task)
+ return;
+
+ cgrp = perf_cgroup_from_task(task);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = now;
+}
+
+#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /*
+ * disable interrupts to avoid geting nr_cgroup
+ * changes via __perf_event_disable(). Also
+ * avoids preemption.
+ */
+ local_irq_save(flags);
+
+ /*
+ * we reschedule only in the presence of cgroup
+ * constrained events.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ /*
+ * perf_cgroup_events says at least one
+ * context on this CPU has cgroup events.
+ *
+ * ctx->nr_cgroups reports the number of cgroup
+ * events for a context.
+ */
+ if (cpuctx->ctx.nr_cgroups > 0) {
+
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
+
+ if (mode & PERF_CGROUP_SWIN) {
+ /* set cgrp before ctxsw in to
+ * allow event_filter_match() to not
+ * have to pass task around
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ }
+ }
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+ perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ struct perf_cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct file *file;
+ int ret = 0, fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+
+ css = cgroup_css_from_dir(file, perf_subsys_id);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+
+ cgrp = container_of(css, struct perf_cgroup, css);
+ event->cgrp = cgrp;
+
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a task belongs
+ * to only one perf cgroup at a time
+ */
+ if (group_leader && group_leader->cgrp != cgrp) {
+ perf_detach_cgroup(event);
+ ret = -EINVAL;
+ } else {
+ /* must be done before we fput() the file */
+ perf_get_cgroup(event);
+ }
+ fput_light(file, fput_needed);
+ return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+ struct perf_cgroup_info *t;
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+ /*
+ * when the current task's perf cgroup does not match
+ * the event's, we need to remember to call the
+ * perf_mark_enable() function the first time a task with
+ * a matching perf cgroup is scheduled in.
+ */
+ if (is_cgroup_event(event) && !perf_cgroup_match(event))
+ event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ if (!event->cgrp_defer_enabled)
+ return;
+
+ event->cgrp_defer_enabled = 0;
+
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->cgrp_defer_enabled = 0;
+ }
+ }
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+}
+#endif
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -343,6 +706,10 @@ static void update_context_time(struct perf_event_context *ctx)
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time(event);
+
return ctx ? ctx->time : 0;
}
@@ -357,9 +724,20 @@ static void update_event_times(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
-
- if (ctx->is_active)
+ /*
+ * in cgroup mode, time_enabled represents
+ * the time the event was enabled AND active
+ * tasks were in the monitored cgroup. This is
+ * independent of the activity of the context as
+ * there may be a mix of cgroup and non-cgroup events.
+ *
+ * That is why we treat cgroup events differently
+ * here.
+ */
+ if (is_cgroup_event(event))
run_end = perf_event_time(event);
+ else if (ctx->is_active)
+ run_end = ctx->time;
else
run_end = event->tstamp_stopped;
@@ -371,6 +749,7 @@ static void update_event_times(struct perf_event *event)
run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
+
}
/*
@@ -419,6 +798,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
+ if (is_cgroup_event(event)) {
+ ctx->nr_cgroups++;
+ /*
+ * one more event:
+ * - that has cgroup constraint on event->cpu
+ * - that may need work on context switch
+ */
+ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+ jump_label_inc(&perf_sched_events);
+ }
+
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
@@ -545,6 +935,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
+ if (is_cgroup_event(event)) {
+ ctx->nr_cgroups--;
+ atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+ jump_label_dec(&perf_sched_events);
+ }
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -616,7 +1012,8 @@ out:
static inline int
event_filter_match(struct perf_event *event)
{
- return event->cpu == -1 || event->cpu == smp_processor_id();
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event);
}
static void
@@ -634,7 +1031,7 @@ event_sched_out(struct perf_event *event,
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
&& !event_filter_match(event)) {
- delta = ctx->time - event->tstamp_stopped;
+ delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
}
@@ -678,12 +1075,6 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
/*
* Cross CPU call to remove a performance event
*
@@ -783,6 +1174,7 @@ static int __perf_event_disable(void *info)
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
@@ -851,6 +1243,41 @@ retry:
raw_spin_unlock_irq(&ctx->lock);
}
+static void perf_set_shadow_time(struct perf_event *event,
+ struct perf_event_context *ctx,
+ u64 tstamp)
+{
+ /*
+ * use the correct time source for the time snapshot
+ *
+ * We could get by without this by leveraging the
+ * fact that to get to this function, the caller
+ * has most likely already called update_context_time()
+ * and update_cgrp_time_xx() and thus both timestamp
+ * are identical (or very close). Given that tstamp is,
+ * already adjusted for cgroup, we could say that:
+ * tstamp - ctx->timestamp
+ * is equivalent to
+ * tstamp - cgrp->timestamp.
+ *
+ * Then, in perf_output_read(), the calculation would
+ * work with no changes because:
+ * - event is guaranteed scheduled in
+ * - no scheduled out in between
+ * - thus the timestamp would be the same
+ *
+ * But this is a bit hairy.
+ *
+ * So instead, we have an explicit cgroup call to remain
+ * within the time time source all along. We believe it
+ * is cleaner and simpler to understand.
+ */
+ if (is_cgroup_event(event))
+ perf_cgroup_set_shadow_time(event, tstamp);
+ else
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -891,7 +1318,7 @@ event_sched_in(struct perf_event *event,
event->tstamp_running += tstamp - event->tstamp_stopped;
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ perf_set_shadow_time(event, ctx, tstamp);
if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -1012,7 +1439,8 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = tstamp;
}
-static void perf_event_context_sched_in(struct perf_event_context *ctx);
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *tsk);
/*
* Cross CPU call to install and enable a performance event
@@ -1033,11 +1461,17 @@ static int __perf_install_in_context(void *info)
* which do context switches with IRQs enabled.
*/
if (ctx->task && !cpuctx->task_ctx)
- perf_event_context_sched_in(ctx);
+ perf_event_context_sched_in(ctx, ctx->task);
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
update_context_time(ctx);
+ /*
+ * update cgrp time only if current cgrp
+ * matches event->cgrp. Must be done before
+ * calling add_event_to_ctx()
+ */
+ update_cgrp_time_from_event(event);
add_event_to_ctx(event, ctx);
@@ -1175,10 +1609,19 @@ static int __perf_event_enable(void *info)
if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto unlock;
+
+ /*
+ * set current task's cgroup time reference point
+ */
+ perf_cgroup_set_timestamp(current, perf_clock());
+
__perf_event_mark_enabled(event, ctx);
- if (!event_filter_match(event))
+ if (!event_filter_match(event)) {
+ if (is_cgroup_event(event))
+ perf_cgroup_defer_enabled(event);
goto unlock;
+ }
/*
* If the event is in a group and isn't the group leader,
@@ -1307,6 +1750,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
goto out;
update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
goto out;
@@ -1496,6 +1940,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
+
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch out PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_out(task);
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1534,6 +1986,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -1566,6 +2022,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -1576,15 +2036,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
+ u64 now;
+
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
if (likely(!ctx->nr_events))
goto out;
- ctx->timestamp = perf_clock();
-
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, now);
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -1601,11 +2065,12 @@ out:
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
struct perf_event_context *ctx = &cpuctx->ctx;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task);
}
static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1617,11 +2082,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
if (cpuctx->task_ctx == ctx)
return;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, NULL);
cpuctx->task_ctx = ctx;
}
-static void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
@@ -1637,9 +2103,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx)
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
cpuctx->task_ctx = ctx;
@@ -1672,8 +2138,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
if (likely(!ctx))
continue;
- perf_event_context_sched_in(ctx);
+ perf_event_context_sched_in(ctx, task);
}
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch in PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_in(task);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1873,7 +2346,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
if (ctx)
rotate_ctx(ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
if (ctx)
task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
@@ -1952,7 +2425,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
raw_spin_unlock(&ctx->lock);
- perf_event_context_sched_in(ctx);
+ perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
}
@@ -1977,8 +2450,10 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active)
+ if (ctx->is_active) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
update_event_times(event);
if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
@@ -2009,8 +2484,10 @@ static u64 perf_event_read(struct perf_event *event)
* (e.g., thread is blocked), in that case
* we cannot update context time
*/
- if (ctx->is_active)
+ if (ctx->is_active) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -2395,7 +2872,7 @@ static void free_event(struct perf_event *event)
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec(&perf_task_events);
+ jump_label_dec(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -2411,6 +2888,9 @@ static void free_event(struct perf_event *event)
event->buffer = NULL;
}
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
if (event->destroy)
event->destroy(event);
@@ -4478,26 +4958,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
if (unlikely(!is_sampling_event(event)))
return 0;
- if (!throttle) {
- hwc->interrupts++;
- } else {
- if (hwc->interrupts != MAX_INTERRUPTS) {
- hwc->interrupts++;
- if (HZ * hwc->interrupts >
- (u64)sysctl_perf_event_sample_rate) {
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
- } else {
- /*
- * Keep re-disabling events even though on the previous
- * pass we disabled it - just in case we raced with a
- * sched-in and the event got enabled again:
- */
+ if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
+ if (throttle) {
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
ret = 1;
}
- }
+ } else
+ hwc->interrupts++;
if (event->attr.freq) {
u64 now = perf_clock();
@@ -5134,6 +5602,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
u64 period;
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
event->pmu->read(event);
perf_sample_data_init(&data, 0);
@@ -5160,9 +5632,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
-
period = local64_read(&hwc->period_left);
if (period) {
if (period < 0)
@@ -5189,6 +5658,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
}
}
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ event->attr.freq = 0;
+ }
+}
+
/*
* Software event: cpu wall time clock
*/
@@ -5241,6 +5734,8 @@ static int cpu_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
+ perf_swevent_init_hrtimer(event);
+
return 0;
}
@@ -5300,6 +5795,7 @@ static void task_clock_event_read(struct perf_event *event)
if (!in_nmi()) {
update_context_time(event->ctx);
+ update_cgrp_time_from_event(event);
time = event->ctx->time;
} else {
u64 now = perf_clock();
@@ -5318,6 +5814,8 @@ static int task_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
+ perf_swevent_init_hrtimer(event);
+
return 0;
}
@@ -5725,7 +6223,7 @@ done:
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_task_events);
+ jump_label_inc(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -5900,7 +6398,7 @@ SYSCALL_DEFINE5(perf_event_open,
int err;
/* for future expandability... */
- if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+ if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
err = perf_copy_attr(attr_uptr, &attr);
@@ -5917,6 +6415,15 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /*
+ * In cgroup mode, the pid argument is used to pass the fd
+ * opened to the cgroup directory in cgroupfs. The cpu argument
+ * designates the cpu on which to monitor threads from that
+ * cgroup.
+ */
+ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+ return -EINVAL;
+
event_fd = get_unused_fd_flags(O_RDWR);
if (event_fd < 0)
return event_fd;
@@ -5934,7 +6441,7 @@ SYSCALL_DEFINE5(perf_event_open,
group_leader = NULL;
}
- if (pid != -1) {
+ if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
@@ -5948,6 +6455,12 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
+ if (flags & PERF_FLAG_PID_CGROUP) {
+ err = perf_cgroup_connect(pid, event, &attr, group_leader);
+ if (err)
+ goto err_alloc;
+ }
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -6808,3 +7321,92 @@ unlock:
return ret;
}
device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+ struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+ struct perf_cgroup_info *t;
+ int c;
+
+ jc = kmalloc(sizeof(*jc), GFP_KERNEL);
+ if (!jc)
+ return ERR_PTR(-ENOMEM);
+
+ memset(jc, 0, sizeof(*jc));
+
+ jc->info = alloc_percpu(struct perf_cgroup_info);
+ if (!jc->info) {
+ kfree(jc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for_each_possible_cpu(c) {
+ t = per_cpu_ptr(jc->info, c);
+ t->time = 0;
+ t->timestamp = 0;
+ }
+ return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+ jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+ struct perf_cgroup, css);
+ free_percpu(jc->info);
+ kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+ struct task_struct *task = info;
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task,
+ bool threadgroup)
+{
+ perf_cgroup_move(task);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ perf_cgroup_move(c);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+ .name = "perf_event",
+ .subsys_id = perf_subsys_id,
+ .create = perf_cgroup_create,
+ .destroy = perf_cgroup_destroy,
+ .exit = perf_cgroup_exit,
+ .attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/sched.c b/kernel/sched.c
index 859982e477ec..850bdb53589a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p)
struct task_group *tg;
struct cgroup_subsys_state *css;
- if (p->flags & PF_EXITING)
- return &root_task_group;
-
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
@@ -9081,7 +9078,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
}
static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb7c830f7faa..40eb4bf48ca4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -941,7 +941,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_perf_event_sample_rate,
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = perf_proc_update_handler,
},
#endif
#ifdef CONFIG_KMEMCHECK
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..8dc8da6733f9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
mutex_lock(&trace_types_lock);
if (tracer_enabled ^ val) {
+
+ /* Only need to warn if this is used to change the state */
+ WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
+
if (val) {
tracer_enabled = 1;
if (current_trace->start)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
ctx_trace = tr;
}
-static void stop_sched_trace(struct trace_array *tr)
-{
- tracing_stop_sched_switch_record();
-}
-
-static int sched_switch_trace_init(struct trace_array *tr)
-{
- ctx_trace = tr;
- tracing_reset_online_cpus(tr);
- tracing_start_sched_switch_record();
- return 0;
-}
-
-static void sched_switch_trace_reset(struct trace_array *tr)
-{
- if (sched_ref)
- stop_sched_trace(tr);
-}
-
-static void sched_switch_trace_start(struct trace_array *tr)
-{
- sched_stopped = 0;
-}
-
-static void sched_switch_trace_stop(struct trace_array *tr)
-{
- sched_stopped = 1;
-}
-
-static struct tracer sched_switch_trace __read_mostly =
-{
- .name = "sched_switch",
- .init = sched_switch_trace_init,
- .reset = sched_switch_trace_reset,
- .start = sched_switch_trace_start,
- .stop = sched_switch_trace_stop,
- .wait_pipe = poll_wait_pipe,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_sched_switch,
-#endif
-};
-
-__init static int init_sched_switch_trace(void)
-{
- return register_tracer(&sched_switch_trace);
-}
-device_initcall(init_sched_switch_trace);
-
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
static struct syscall_metadata **syscalls_metadata;
+#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Only compare after the "sys" prefix. Archs that use
+ * syscall wrappers may have syscalls symbols aliases prefixed
+ * with "SyS" instead of "sys", leading to an unwanted
+ * mismatch.
+ */
+ return !strcmp(sym + 3, name + 3);
+}
+#endif
+
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
stop = __stop_syscalls_metadata;
kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+ if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
+ return NULL;
+
for ( ; start < stop; start++) {
- /*
- * Only compare after the "sys" prefix. Archs that use
- * syscall wrappers may have syscalls symbols aliases prefixed
- * with "SyS" instead of "sys", leading to an unwanted
- * mismatch.
- */
- if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
+ if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
return *start;
}
return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
int init_syscall_trace(struct ftrace_event_call *call)
{
int id;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls) {
+ pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+ ((struct syscall_metadata *)call->data)->name);
+ return -ENOSYS;
+ }
if (set_syscall_print_fmt(call) < 0)
return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
return id;
}
-unsigned long __init arch_syscall_addr(int nr)
+unsigned long __init __weak arch_syscall_addr(int nr)
{
return (unsigned long)sys_call_table[nr];
}