Skip to content

Commit

Permalink
psi: Reduce calls to sched_clock() in psi
Browse files Browse the repository at this point in the history
We noticed that the cost of psi increases with the number of levels in
the cgroup hierarchy. In particular, the cost of cpu_clock() sticks out,
as the kernel calls it multiple times while traversing up the cgroup
tree. This patch reduces the number of calls to cpu_clock().

Performed perf bench on Intel Broadwell with 3 levels of cgroups.

Before the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.747 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.516 [sec]

       3.516689 usecs/op
         284358 ops/sec

After the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.640 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.329 [sec]

       3.329820 usecs/op
         300316 ops/sec

Signed-off-by: Shakeel Butt <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
  • Loading branch information
shakeelb authored and Peter Zijlstra committed Mar 23, 2021
1 parent 2a2f80f commit df77430
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions kernel/sched/psi.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,12 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
}

static void record_times(struct psi_group_cpu *groupc, int cpu)
static void record_times(struct psi_group_cpu *groupc, u64 now)
{
u32 delta;
u64 now;

now = cpu_clock(cpu);
delta = now - groupc->state_start;
groupc->state_start = now;

Expand All @@ -676,7 +674,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu)
}

static void psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set,
unsigned int clear, unsigned int set, u64 now,
bool wake_clock)
{
struct psi_group_cpu *groupc;
Expand All @@ -696,7 +694,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
*/
write_seqcount_begin(&groupc->seq);

record_times(groupc, cpu);
record_times(groupc, now);

for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
Expand Down Expand Up @@ -788,12 +786,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
struct psi_group *group;
bool wake_clock = true;
void *iter = NULL;
u64 now;

if (!task->pid)
return;

psi_flags_change(task, clear, set);

now = cpu_clock(cpu);
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
Expand All @@ -806,7 +806,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
wake_clock = false;

while ((group = iterate_groups(task, &iter)))
psi_group_change(group, cpu, clear, set, wake_clock);
psi_group_change(group, cpu, clear, set, now, wake_clock);
}

void psi_task_switch(struct task_struct *prev, struct task_struct *next,
Expand All @@ -815,6 +815,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
void *iter;
u64 now = cpu_clock(cpu);

if (next->pid) {
bool identical_state;
Expand All @@ -836,7 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
break;
}

psi_group_change(group, cpu, 0, TSK_ONCPU, true);
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
}
}

Expand All @@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,

iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
psi_group_change(group, cpu, clear, set, true);
psi_group_change(group, cpu, clear, set, now, true);

/*
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
Expand All @@ -867,7 +868,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
if (sleep) {
clear &= ~TSK_ONCPU;
for (; group; group = iterate_groups(prev, &iter))
psi_group_change(group, cpu, clear, set, true);
psi_group_change(group, cpu, clear, set, now, true);
}
}
}
Expand Down

0 comments on commit df77430

Please sign in to comment.