Skip to content

Commit

Permalink
perf/core: Set cgroup in CPU contexts for new cgroup events
Browse files Browse the repository at this point in the history
There's a perf stat bug easy to observer on a machine with only one cgroup:

  $ perf stat -e cycles -I 1000 -C 0 -G /
  #          time             counts unit events
      1.000161699      <not counted>      cycles                    /
      2.000355591      <not counted>      cycles                    /
      3.000565154      <not counted>      cycles                    /
      4.000951350      <not counted>      cycles                    /

We'd expect some output there.

The underlying problem is that there is an optimization in
perf_cgroup_sched_{in,out}() that skips the switch of cgroup events
if the old and new cgroups in a task switch are the same.

This optimization interacts with the current code in two ways
that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a
cgroup event matches the current task. These are:

  1. On creation of the first cgroup event in a CPU: In current code,
  cpuctx->cpu is only set in perf_cgroup_sched_in, but due to the
  aforesaid optimization, perf_cgroup_sched_in will run until the next
  cgroup switches in that CPU. This may happen late or never happen,
  depending on system's number of cgroups, CPU load, etc.

  2. On deletion of the last cgroup event in a cpuctx: In list_del_event,
  cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in
  because cpuctx->cgrp == NULL until a cgroup switch occurs and
  perf_cgroup_sched_in is executed (updating cpuctx->cgrp).

This patch fixes both problems by setting cpuctx->cgrp in list_add_event,
mirroring what list_del_event does when removing a cgroup event from CPU
context, as introduced in:

  commit 68cacd2 ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()")

With this patch, cpuctx->cgrp is always set/clear when installing/removing
the first/last cgroup event in/from the CPU context. With cpuctx->cgrp
correctly set, event_filter_match works as intended when events are
sched in/out.

After the fix, the output is as expected:

  $ perf stat -e cycles -I 1000 -a -G /
  #         time             counts unit events
     1.004699159          627342882      cycles                    /
     2.007397156          6152726      cycles                    /
     3.010019057          616726074      cycles                    /

Signed-off-by: David Carrillo-Cisneros <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Kan Liang <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Paul Turner <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Stephane Eranian <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vegard Nossum <[email protected]>
Cc: Vince Weaver <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
  • Loading branch information
David Carrillo-Cisneros authored and Ingo Molnar committed Aug 10, 2016
1 parent 0b8f1e2 commit db4a835
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 18 deletions.
4 changes: 4 additions & 0 deletions include/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,9 @@ struct perf_event_context {
u64 parent_gen;
u64 generation;
int pin_count;
#ifdef CONFIG_CGROUP_PERF
int nr_cgroups; /* cgroup evts */
#endif
void *task_ctx_data; /* pmu specific data */
struct rcu_head rcu_head;
};
Expand All @@ -769,7 +771,9 @@ struct perf_cpu_context {
unsigned int hrtimer_active;

struct pmu *unique_pmu;
#ifdef CONFIG_CGROUP_PERF
struct perf_cgroup *cgrp;
#endif
};

struct perf_output_handle {
Expand Down
54 changes: 36 additions & 18 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
}
}
}

/*
* Update cpuctx->cgrp so that it is set when first cgroup event is added and
* cleared when last cgroup event is removed.
*/
static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
struct perf_cpu_context *cpuctx;

if (!is_cgroup_event(event))
return;

if (add && ctx->nr_cgroups++)
return;
else if (!add && --ctx->nr_cgroups)
return;
/*
* Because cgroup events are always per-cpu events,
* this will always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
cpuctx->cgrp = add ? event->cgrp : NULL;
}

#else /* !CONFIG_CGROUP_PERF */

static inline bool
Expand Down Expand Up @@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}

static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
}

#endif

/*
Expand Down Expand Up @@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{

lockdep_assert_held(&ctx->lock);

WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
Expand All @@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}

if (is_cgroup_event(event))
ctx->nr_cgroups++;
list_update_cgroup_event(event, ctx, true);

list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
Expand Down Expand Up @@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;

WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);

Expand All @@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)

event->attach_state &= ~PERF_ATTACH_CONTEXT;

if (is_cgroup_event(event)) {
ctx->nr_cgroups--;
/*
* Because cgroup events are always per-cpu events, this will
* always be called from the right CPU.
*/
cpuctx = __get_cpu_context(ctx);
/*
* If there are no more cgroup events then clear cgrp to avoid
* stale pointer in update_cgrp_time_from_cpuctx().
*/
if (!ctx->nr_cgroups)
cpuctx->cgrp = NULL;
}
list_update_cgroup_event(event, ctx, false);

ctx->nr_events--;
if (event->attr.inherit_stat)
Expand Down

0 comments on commit db4a835

Please sign in to comment.