Skip to content

Commit

Permalink
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm…
Browse files Browse the repository at this point in the history
…/linux/kernel/git/tip/tip

Ingo writes:
  "scheduler fixes:

   These fixes address a rather involved performance regression between
   v4.17->v4.19 in the sched/numa auto-balancing code. Since distros
   really need this fix we accelerated it to sched/urgent for a faster
   upstream merge.

   NUMA scheduling and balancing performance is now largely back to
   v4.17 levels, without reintroducing the NUMA placement bugs that
   v4.18 and v4.19 fixed.

   Many thanks to Srikar Dronamraju, Mel Gorman and Jirka Hladky, for
   reporting, testing, re-testing and solving this rather complex set of
   bugs."

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task
  mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration
  sched/numa: Avoid task migration for small NUMA improvement
  mm/migrate: Use spin_trylock() while resetting rate limit
  sched/numa: Limit the conditions where scan period is reset
  sched/numa: Reset scan rate whenever task moves across nodes
  sched/numa: Pass destination CPU as a parameter to migrate_task_rq
  sched/numa: Stop multiple tasks from moving to the CPU at the same time
  • Loading branch information
gregkh committed Oct 5, 2018
2 parents 1df377d + 37355bd commit 8be6737
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 108 deletions.
6 changes: 0 additions & 6 deletions include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -671,12 +671,6 @@ typedef struct pglist_data {
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;

/* Rate limiting time interval */
unsigned long numabalancing_migrate_next_window;

/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
/*
* This is a per-node reserve of pages that are not available
Expand Down
27 changes: 0 additions & 27 deletions include/trace/events/migrate.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,33 +70,6 @@ TRACE_EVENT(mm_migrate_pages,
__print_symbolic(__entry->mode, MIGRATE_MODE),
__print_symbolic(__entry->reason, MIGRATE_REASON))
);

TRACE_EVENT(mm_numa_migrate_ratelimit,

TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),

TP_ARGS(p, dst_nid, nr_pages),

TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN)
__field( pid_t, pid)
__field( int, dst_nid)
__field( unsigned long, nr_pages)
),

TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->dst_nid = dst_nid;
__entry->nr_pages = nr_pages;
),

TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
__entry->comm,
__entry->pid,
__entry->dst_nid,
__entry->nr_pages)
);
#endif /* _TRACE_MIGRATE_H */

/* This part must be outside protection */
Expand Down
2 changes: 1 addition & 1 deletion kernel/sched/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p);
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
perf_event_task_migrate(p);
Expand Down
2 changes: 1 addition & 1 deletion kernel/sched/deadline.c
Original file line number Diff line number Diff line change
Expand Up @@ -1607,7 +1607,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
return cpu;
}

static void migrate_task_rq_dl(struct task_struct *p)
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
struct rq *rq;

Expand Down
104 changes: 91 additions & 13 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int last_cpupid, this_cpupid;

this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;

/*
* Multi-stage node selection is used in conjunction with a periodic
Expand All @@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* This quadric squishes small probabilities, making it less likely we
* act on an unlikely task<->page relation.
*/
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!cpupid_pid_unset(last_cpupid) &&
cpupid_to_nid(last_cpupid) != dst_nid)
return false;
Expand Down Expand Up @@ -1514,6 +1524,21 @@ struct task_numa_env {
static void task_numa_assign(struct task_numa_env *env,
struct task_struct *p, long imp)
{
struct rq *rq = cpu_rq(env->dst_cpu);

/* Bail out if run-queue part of active NUMA balance. */
if (xchg(&rq->numa_migrate_on, 1))
return;

/*
* Clear previous best_cpu/rq numa-migrate flag, since task now
* found a better CPU to move/swap.
*/
if (env->best_cpu != -1) {
rq = cpu_rq(env->best_cpu);
WRITE_ONCE(rq->numa_migrate_on, 0);
}

if (env->best_task)
put_task_struct(env->best_task);
if (p)
Expand Down Expand Up @@ -1552,6 +1577,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
return (imb > old_imb);
}

/*
* Maximum NUMA importance can be 1998 (2*999);
* SMALLIMP @ 30 would be close to 1998/64.
* Used to deter task migration.
*/
#define SMALLIMP 30

/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
Expand All @@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env,
long moveimp = imp;
int dist = env->dist;

if (READ_ONCE(dst_rq->numa_migrate_on))
return;

rcu_read_lock();
cur = task_rcu_dereference(&dst_rq->curr);
if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
Expand All @@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env,
goto unlock;

if (!cur) {
if (maymove || imp > env->best_imp)
if (maymove && moveimp >= env->best_imp)
goto assign;
else
goto unlock;
Expand Down Expand Up @@ -1625,15 +1660,21 @@ static void task_numa_compare(struct task_numa_env *env,
task_weight(cur, env->dst_nid, dist);
}

if (imp <= env->best_imp)
goto unlock;

if (maymove && moveimp > imp && moveimp > env->best_imp) {
imp = moveimp - 1;
imp = moveimp;
cur = NULL;
goto assign;
}

/*
* If the NUMA importance is less than SMALLIMP,
* task migration might only result in ping pong
* of tasks and also hurt performance due to cache
* misses.
*/
if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
goto unlock;

/*
* In the overloaded case, try and keep the load balanced.
*/
Expand Down Expand Up @@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_cpu = -1,
};
struct sched_domain *sd;
struct rq *best_rq;
unsigned long taskweight, groupweight;
int nid, ret, dist;
long taskimp, groupimp;
Expand Down Expand Up @@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;

/*
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
p->numa_scan_period = task_scan_start(p);

best_rq = cpu_rq(env.best_cpu);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);
if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
return ret;
}

ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
WRITE_ONCE(best_rq->numa_migrate_on, 0);

if (ret != 0)
trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Expand Down Expand Up @@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}

static void update_scan_period(struct task_struct *p, int new_cpu)
{
int src_nid = cpu_to_node(task_cpu(p));
int dst_nid = cpu_to_node(new_cpu);

if (!static_branch_likely(&sched_numa_balancing))
return;

if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
return;

if (src_nid == dst_nid)
return;

/*
* Allow resets if faults have been trapped before one scan
* has completed. This is most likely due to a new task that
* is pulled cross-node due to wakeups or load balancing.
*/
if (p->numa_scan_seq) {
/*
* Avoid scan adjustments if moving to the preferred
* node or if the task was not previously running on
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
(p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
return;
}

p->numa_scan_period = task_scan_start(p);
}

#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
Expand All @@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}

static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}

#endif /* CONFIG_NUMA_BALANCING */

static void
Expand Down Expand Up @@ -6275,7 +6351,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
static void migrate_task_rq_fair(struct task_struct *p)
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* As blocked tasks retain absolute vruntime the migration needs to
Expand Down Expand Up @@ -6328,6 +6404,8 @@ static void migrate_task_rq_fair(struct task_struct *p)

/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;

update_scan_period(p, new_cpu);
}

static void task_dead_fair(struct task_struct *p)
Expand Down
3 changes: 2 additions & 1 deletion kernel/sched/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,7 @@ struct rq {
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
Expand Down Expand Up @@ -1523,7 +1524,7 @@ struct sched_class {

#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

void (*task_woken)(struct rq *this_rq, struct task_struct *task);

Expand Down
57 changes: 0 additions & 57 deletions mm/migrate.c
Original file line number Diff line number Diff line change
Expand Up @@ -1855,46 +1855,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
return newpage;
}

/*
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);

/* Returns true if the node is migrate rate-limited after the update */
static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
unsigned long nr_pages)
{
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
spin_lock(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies +
msecs_to_jiffies(migrate_interval_millisecs);
spin_unlock(&pgdat->numabalancing_migrate_lock);
}
if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
nr_pages);
return true;
}

/*
* This is an unlocked non-atomic update so errors are possible.
* The consequences are failing to migrate when we potentiall should
* have which is not severe enough to warrant locking. If it is ever
* a problem, it can be converted to a per-cpu counter.
*/
pgdat->numabalancing_migrate_nr_pages += nr_pages;
return false;
}

static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
Expand Down Expand Up @@ -1967,14 +1927,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (page_is_file_cache(page) && PageDirty(page))
goto out;

/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
if (numamigrate_update_ratelimit(pgdat, 1))
goto out;

isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated)
goto out;
Expand Down Expand Up @@ -2021,14 +1973,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unsigned long mmun_start = address & HPAGE_PMD_MASK;
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;

/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
goto out_dropref;

new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
Expand Down Expand Up @@ -2125,7 +2069,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,

out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
Expand Down
2 changes: 0 additions & 2 deletions mm/page_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -6197,8 +6197,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
static void pgdat_init_numabalancing(struct pglist_data *pgdat)
{
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
}
#else
static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
Expand Down

0 comments on commit 8be6737

Please sign in to comment.