Skip to content

Commit

Permalink
Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/…
Browse files Browse the repository at this point in the history
…linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: do not count frozen tasks toward load
  sched: refresh MAINTAINERS entry
  sched: Print sched_group::__cpu_power in sched_domain_debug
  cpuacct: add per-cgroup utime/stime statistics
  posixtimers, sched: Fix posix clock monotonicity
  sched_rt: don't allocate cpumask in fastpath
  cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when rcupreempt is used -v2
  • Loading branch information
torvalds committed Apr 9, 2009
2 parents 422a253 + e3c8ca8 commit 17b2e9b
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 34 deletions.
18 changes: 18 additions & 0 deletions Documentation/cgroups/cpuacct.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
process (bash) into it. CPU time consumed by this bash and its children
can be obtained from g1/cpuacct.usage and the same is accumulated in
/cgroups/cpuacct.usage also.

cpuacct.stat file lists a few statistics which further divide the
CPU time obtained by the cgroup into user and system times. Currently
the following statistics are supported:

user: Time spent by tasks of the cgroup in user mode.
system: Time spent by tasks of the cgroup in kernel mode.

Both user and system times are reported in units of USER_HZ.

cpuacct controller uses percpu_counter interface to collect user and
system times. This has two side effects:

- It is theoretically possible to see wrong values for user and system times.
This is because percpu_counter_read() on 32bit systems isn't safe
against concurrent writes.
- It is possible to see slightly outdated values for user and system times
due to the batch processing nature of percpu_counter.
4 changes: 2 additions & 2 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -3873,8 +3873,8 @@ S: Maintained
SCHEDULER
P: Ingo Molnar
M: [email protected]
P: Robert Love [the preemptible kernel bits]
M: [email protected]
P: Peter Zijlstra
M: [email protected]
L: [email protected]
S: Maintained

Expand Down
3 changes: 2 additions & 1 deletion include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh;
/*
 * Evaluates to non-zero when the task's state has __TASK_STOPPED or
 * __TASK_TRACED set.
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (for example "p + 1") expands correctly, not only a plain identifier.
 */
#define task_is_stopped_or_traced(task) \
	(((task)->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) \
((task->state & TASK_UNINTERRUPTIBLE) != 0)
((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
(task->flags & PF_FROZEN) == 0)

/*
 * Store state_value into tsk->state.
 * The do { } while (0) wrapper makes the macro expand to a single
 * statement, so it can be used safely as the body of an if/else.
 */
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
Expand Down
7 changes: 4 additions & 3 deletions kernel/posix-cpu-timers.c
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
cpu->cpu = virt_ticks(p);
break;
case CPUCLOCK_SCHED:
cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
cpu->sched = task_sched_runtime(p);
break;
}
return 0;
Expand Down Expand Up @@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
{
struct task_cputime cputime;

thread_group_cputime(p, &cputime);
switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
thread_group_cputime(p, &cputime);
cpu->cpu = cputime_add(cputime.utime, cputime.stime);
break;
case CPUCLOCK_VIRT:
thread_group_cputime(p, &cputime);
cpu->cpu = cputime.utime;
break;
case CPUCLOCK_SCHED:
cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
cpu->sched = thread_group_sched_runtime(p);
break;
}
return 0;
Expand Down
160 changes: 145 additions & 15 deletions kernel/sched.c
Original file line number Diff line number Diff line change
Expand Up @@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct rq_iterator *iterator);
#endif

/*
 * Statistics kept per cpu accounting group: time spent by the tasks of
 * the group executing in a given mode.
 */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER = 0,	/* time spent in user mode */
	CPUACCT_STAT_SYSTEM,	/* time spent in kernel mode */

	CPUACCT_STAT_NSTATS,	/* number of statistics; must stay last */
};

/*
 * cpuacct hooks used by the scheduler accounting paths.
 * With CONFIG_CGROUP_CPUACCT the real implementations are only declared
 * here; without it both hooks are empty inline stubs, so call sites can
 * invoke them unconditionally.
 */
#ifdef CONFIG_CGROUP_CPUACCT
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static void cpuacct_update_stats(struct task_struct *tsk,
enum cpuacct_stat_index idx, cputime_t val);
#else
/* cpuacct controller not configured: accounting calls do nothing. */
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_update_stats(struct task_struct *tsk,
enum cpuacct_stat_index idx, cputime_t val) {}
#endif

static inline void inc_cpu_load(struct rq *rq, unsigned long load)
Expand Down Expand Up @@ -4511,26 +4523,75 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

/*
* Return any ns on the sched_clock that have not yet been banked in
* Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
*
* Called with task_rq_lock() held on @rq.
*/
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
u64 ns = 0;

if (task_current(rq, p)) {
update_rq_clock(rq);
ns = rq->clock - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}

return ns;
}

unsigned long long task_delta_exec(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
u64 ns = 0;

rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);

if (task_current(rq, p)) {
u64 delta_exec;
return ns;
}

update_rq_clock(rq);
delta_exec = rq->clock - p->se.exec_start;
if ((s64)delta_exec > 0)
ns = delta_exec;
}
/*
 * Return the runtime accounted to @p (se.sum_exec_runtime).
 * If @p is running right now, the time it has run since it was last
 * accounted is added on top, so pending runtime is included as well.
 *
 * Takes and releases the task's runqueue lock around the sample.
 */
unsigned long long task_sched_runtime(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq = task_rq_lock(p, &flags);
	u64 runtime = p->se.sum_exec_runtime;

	runtime += do_task_delta_exec(p, rq);
	task_rq_unlock(rq, &flags);

	return runtime;
}

/*
* Return sum_exec_runtime for the thread group.
* In case the task is currently running, return the sum plus current's
* pending runtime that has not been accounted yet.
*
* Note that the thread group might have other running tasks as well,
* so the return value does not include the pending runtime that those
* other running tasks might have.
*/
unsigned long long thread_group_sched_runtime(struct task_struct *p)
{
struct task_cputime totals;
unsigned long flags;
struct rq *rq;
u64 ns;

rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
task_rq_unlock(rq, &flags);

return ns;
Expand Down Expand Up @@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
cpustat->nice = cputime64_add(cpustat->nice, tmp);
else
cpustat->user = cputime64_add(cpustat->user, tmp);

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
Expand Down Expand Up @@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
else
cpustat->system = cputime64_add(cpustat->system, tmp);

cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

/* Account for system time used */
acct_update_integrals(p);
}
Expand Down Expand Up @@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
printk(KERN_CONT " %s (__cpu_power = %d)", str,
group->__cpu_power);

group = group->next;
} while (group != sd->groups);
Expand Down Expand Up @@ -9925,6 +9991,7 @@ struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
u64 *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};

Expand All @@ -9949,28 +10016,43 @@ static struct cgroup_subsys_state *cpuacct_create(
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
int i;

if (!ca)
return ERR_PTR(-ENOMEM);
goto out;

ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage) {
kfree(ca);
return ERR_PTR(-ENOMEM);
}
if (!ca->cpuusage)
goto out_free_ca;

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
if (percpu_counter_init(&ca->cpustat[i], 0))
goto out_free_counters;

if (cgrp->parent)
ca->parent = cgroup_ca(cgrp->parent);

return &ca->css;

out_free_counters:
while (--i >= 0)
percpu_counter_destroy(&ca->cpustat[i]);
free_percpu(ca->cpuusage);
out_free_ca:
kfree(ca);
out:
return ERR_PTR(-ENOMEM);
}

/*
 * Tear down a cpu accounting group: destroy each of its statistics
 * counters, release the per-cpu usage storage, then free the group.
 */
static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
	int stat = 0;

	while (stat < CPUACCT_STAT_NSTATS) {
		percpu_counter_destroy(&ca->cpustat[stat]);
		stat++;
	}

	free_percpu(ca->cpuusage);
	kfree(ca);
}
Expand Down Expand Up @@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
return 0;
}

static const char *cpuacct_stat_desc[] = {
[CPUACCT_STAT_USER] = "user",
[CPUACCT_STAT_SYSTEM] = "system",
};

/*
 * Report every statistic of the cgroup's accounting group through
 * cb->fill() as a (name, value) pair. Each counter is sampled with
 * percpu_counter_read() and passed through cputime64_to_clock_t()
 * before being reported. Always returns 0.
 */
static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
			      struct cgroup_map_cb *cb)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
	int stat = 0;

	while (stat < CPUACCT_STAT_NSTATS) {
		s64 ticks = percpu_counter_read(&ca->cpustat[stat]);

		ticks = cputime64_to_clock_t(ticks);
		cb->fill(cb, cpuacct_stat_desc[stat], ticks);
		stat++;
	}

	return 0;
}

static struct cftype files[] = {
{
.name = "usage",
Expand All @@ -10067,7 +10168,10 @@ static struct cftype files[] = {
.name = "usage_percpu",
.read_seq_string = cpuacct_percpu_seq_read,
},

{
.name = "stat",
.read_map = cpuacct_stats_show,
},
};

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
Expand All @@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
return;

cpu = task_cpu(tsk);

rcu_read_lock();

ca = task_ca(tsk);

for (; ca; ca = ca->parent) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}

rcu_read_unlock();
}

/*
 * Add @val to the @idx (user or system) statistic of the accounting
 * group @tsk belongs to, and of every ancestor group reached by
 * following ->parent.
 */
static void cpuacct_update_stats(struct task_struct *tsk,
		enum cpuacct_stat_index idx, cputime_t val)
{
	struct cpuacct *ca;

	/* Nothing to charge while the cpuacct subsystem is not active. */
	if (unlikely(!cpuacct_subsys.active))
		return;

	/* The walk up the hierarchy runs inside an RCU read-side section. */
	rcu_read_lock();
	ca = task_ca(tsk);
	do {
		percpu_counter_add(&ca->cpustat[idx], val);
	} while ((ca = ca->parent) != NULL);
	rcu_read_unlock();
}

struct cgroup_subsys cpuacct_subsys = {
Expand Down
5 changes: 3 additions & 2 deletions kernel/sched_cpupri.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ static int convert_prio(int prio)
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
* @lowest_mask: A mask to fill in with selected CPUs
* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
*
* Note: This function returns the recommended CPUs as calculated during the
* current invocation. By the time the call returns, the CPUs may have in
Expand All @@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;

cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
if (lowest_mask)
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
return 1;
}

Expand Down
15 changes: 4 additions & 11 deletions kernel/sched_rt.c
Original file line number Diff line number Diff line change
Expand Up @@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
cpumask_var_t mask;

if (rq->curr->rt.nr_cpus_allowed == 1)
return;

if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
return;

if (p->rt.nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, mask))
goto free;
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;

if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
goto free;
if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;

/*
* There appears to be other cpus that can accept
Expand All @@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
*/
requeue_task_rt(rq, p, 1);
resched_task(rq->curr);
free:
free_cpumask_var(mask);
}

#endif /* CONFIG_SMP */
Expand Down

0 comments on commit 17b2e9b

Please sign in to comment.