Skip to content

Commit

Permalink
sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency
Browse files Browse the repository at this point in the history
Commit d670ec1 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc
test case in cost of breaking another one. After that commit, calling
clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result
of Y time being smaller than X time.

Reproducer/tester can be found further below, it can be compiled and ran by:

	gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread
	while ./tst-cpuclock2 ; do : ; done

This reproducer, when running on a buggy kernel, will complain
about "clock_gettime difference too small".

Issue happens because on start in thread_group_cputimer() we initialize
sum_exec_runtime of cputimer with threads runtime not yet accounted and
then add the threads runtime to running cputimer again on scheduler
tick, making it's sum_exec_runtime bigger than actual threads runtime.

KOSAKI Motohiro posted a fix for this problem, but that patch was never
applied: https://lkml.org/lkml/2013/5/26/191 .

This patch takes different approach to cure the problem. It calls
update_curr() when cputimer starts, that assure we will have updated
stats of running threads and on the next schedule tick we will account
only the runtime that elapsed from cputimer start. That also assure we
have consistent state between cpu times of individual threads and cpu
time of the process consisted by those threads.

Full reproducer (tst-cpuclock2.c):

	#define _GNU_SOURCE
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <stdio.h>
	#include <time.h>
	#include <pthread.h>
	#include <stdint.h>
	#include <inttypes.h>

	/* Parameters for the Linux kernel ABI for CPU clocks.  */
	#define CPUCLOCK_SCHED          2
	#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
		((~(clockid_t) (pid) << 3) | (clockid_t) (clock))

	static pthread_barrier_t barrier;

	/* Help advance the clock.  */
	static void *chew_cpu(void *arg)
	{
		pthread_barrier_wait(&barrier);
		while (1) ;

		return NULL;
	}

	/* Don't use the glibc wrapper.  */
	static int do_nanosleep(int flags, const struct timespec *req)
	{
		clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED);

		return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL);
	}

	static int64_t tsdiff(const struct timespec *before, const struct timespec *after)
	{
		int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec;
		int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec;

		return after_i - before_i;
	}

	int main(void)
	{
		int result = 0;
		pthread_t th;

		pthread_barrier_init(&barrier, NULL, 2);

		if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) {
			perror("pthread_create");
			return 1;
		}

		pthread_barrier_wait(&barrier);

		/* The test.  */
		struct timespec before, after, sleeptimeabs;
		int64_t sleepdiff, diffabs;
		const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 };

		/* The relative nanosleep.  Not sure why this is needed, but its presence
		   seems to make it easier to reproduce the problem.  */
		if (do_nanosleep(0, &sleeptime) != 0) {
			perror("clock_nanosleep");
			return 1;
		}

		/* Get the current time.  */
		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) {
			perror("clock_gettime[2]");
			return 1;
		}

		/* Compute the absolute sleep time based on the current time.  */
		uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec;
		sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000;
		sleeptimeabs.tv_nsec = nsec % 1000000000;

		/* Sleep for the computed time.  */
		if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) {
			perror("absolute clock_nanosleep");
			return 1;
		}

		/* Get the time after the sleep.  */
		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) {
			perror("clock_gettime[3]");
			return 1;
		}

		/* The time after sleep should always be equal to or after the absolute sleep
		   time passed to clock_nanosleep.  */
		sleepdiff = tsdiff(&sleeptimeabs, &after);
		if (sleepdiff < 0) {
			printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff);
			result = 1;

			printf("Before %llu.%09llu\n", before.tv_sec, before.tv_nsec);
			printf("After  %llu.%09llu\n", after.tv_sec, after.tv_nsec);
			printf("Sleep  %llu.%09llu\n", sleeptimeabs.tv_sec, sleeptimeabs.tv_nsec);
		}

		/* The difference between the timestamps taken before and after the
		   clock_nanosleep call should be equal to or more than the duration of the
		   sleep.  */
		diffabs = tsdiff(&before, &after);
		if (diffabs < sleeptime.tv_nsec) {
			printf("clock_gettime difference too small: %" PRId64 "\n", diffabs);
			result = 1;
		}

		pthread_cancel(th);

		return result;
	}

Signed-off-by: Stanislaw Gruszka <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: KOSAKI Motohiro <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
  • Loading branch information
Stanislaw Gruszka authored and Ingo Molnar committed Nov 16, 2014
1 parent 23cfa36 commit 6e99891
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 27 deletions.
38 changes: 11 additions & 27 deletions kernel/sched/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -2474,31 +2474,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
* Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
*
* Called with task_rq_lock() held on @rq.
*/
static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
u64 ns = 0;

/*
* Must be ->curr _and_ ->on_rq. If dequeued, we would
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}

return ns;
}

/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
Expand All @@ -2508,7 +2483,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
u64 ns = 0;
u64 ns;

#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
Expand All @@ -2527,7 +2502,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#endif

rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
/*
* Must be ->curr _and_ ->on_rq. If dequeued, we would
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &flags);

return ns;
Expand Down
2 changes: 2 additions & 0 deletions kernel/sched/deadline.c
Original file line number Diff line number Diff line change
Expand Up @@ -1701,4 +1701,6 @@ const struct sched_class dl_sched_class = {
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,

.update_curr = update_curr_dl,
};
7 changes: 7 additions & 0 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
{
update_curr(cfs_rq_of(&rq->curr->se));
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
Expand Down Expand Up @@ -7956,6 +7961,8 @@ const struct sched_class fair_sched_class = {

.get_rr_interval = get_rr_interval_fair,

.update_curr = update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
Expand Down
2 changes: 2 additions & 0 deletions kernel/sched/rt.c
Original file line number Diff line number Diff line change
Expand Up @@ -2128,6 +2128,8 @@ const struct sched_class rt_sched_class = {

.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,

.update_curr = update_curr_rt,
};

#ifdef CONFIG_SCHED_DEBUG
Expand Down
2 changes: 2 additions & 0 deletions kernel/sched/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,8 @@ struct sched_class {
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);

void (*update_curr) (struct rq *rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
Expand Down

0 comments on commit 6e99891

Please sign in to comment.